1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.h"
17
18 #include "absl/algorithm/container.h"
19 #include "absl/strings/str_cat.h"
20 #include "absl/strings/str_format.h"
21 #include "absl/time/time.h"
22 #include "absl/types/optional.h"
23 #include "tensorflow/compiler/xla/literal_util.h"
24 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
25 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
26 #include "tensorflow/compiler/xla/service/gpu/gpu_autotuning.pb.h"
27 #include "tensorflow/compiler/xla/service/gpu/hlo_algorithm_denylist.h"
28 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
29 #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h"
30 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
31 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
32 #include "tensorflow/compiler/xla/status_macros.h"
33 #include "tensorflow/compiler/xla/util.h"
34 #include "tensorflow/core/lib/strings/numbers.h"
35 #include "tensorflow/core/platform/logger.h"
36 #include "tensorflow/core/platform/mutex.h"
37 #include "tensorflow/core/util/env_var.h"
38 #include "tensorflow/core/util/proto/proto_utils.h"
39
40 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
41 #include "tensorflow/compiler/xla/service/gpu/buffer_comparator.h"
42 #include "tensorflow/stream_executor/gpu/redzone_allocator.h"
43 #endif
44
45 namespace xla {
46 namespace gpu {
47 namespace {
48
49 using absl::optional;
50 using se::DeviceMemoryBase;
51 using se::dnn::AlgorithmDesc;
52 using tensorflow::AutotuneResult;
53
54 class ScratchAllocator : public se::ScratchAllocator {
55 public:
ScratchAllocator(int device_ordinal,se::DeviceMemoryAllocator * memory_allocator)56 ScratchAllocator(int device_ordinal,
57 se::DeviceMemoryAllocator* memory_allocator)
58 : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}
59
GetMemoryLimitInBytes()60 int64 GetMemoryLimitInBytes() override {
61 return 1LL << 32; // 4GB. TODO(jlebar): Tune this?
62 }
TotalAllocatedBytes()63 int64 TotalAllocatedBytes() { return total_allocated_bytes_; }
64
65 StatusOr<se::DeviceMemory<uint8>> AllocateBytes(int64 byte_size) override;
66
67 template <typename T>
Allocate(int64 num_elements)68 StatusOr<se::DeviceMemory<T>> Allocate(int64 num_elements) {
69 TF_ASSIGN_OR_RETURN(se::DeviceMemory<uint8> bytes,
70 AllocateBytes(num_elements * sizeof(T)));
71 return se::DeviceMemory<T>(bytes);
72 }
73
74 private:
75 const int device_ordinal_;
76 se::DeviceMemoryAllocator* memory_allocator_;
77 std::vector<se::OwningDeviceMemory> allocated_buffers_;
78 int64 total_allocated_bytes_ = 0;
79 };
80
AllocateBytes(int64 byte_size)81 StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
82 int64 byte_size) {
83 CHECK_GE(byte_size, 0) << "byte_size must be positive.";
84 if (byte_size > GetMemoryLimitInBytes()) {
85 return se::port::Status(
86 se::port::error::RESOURCE_EXHAUSTED,
87 absl::StrFormat(
88 "Allocating %d bytes exceeds the memory limit of %d bytes.",
89 byte_size, GetMemoryLimitInBytes()));
90 }
91
92 TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
93 memory_allocator_->Allocate(device_ordinal_, byte_size,
94 /*retry_on_failure=*/false));
95 total_allocated_bytes_ += byte_size;
96
97 se::DeviceMemoryBase buffer_addr = *allocated_buffer;
98 allocated_buffers_.push_back(std::move(allocated_buffer));
99 return se::DeviceMemory<uint8>(buffer_addr);
100 }
101
GetAlgorithms(CudnnConvKind kind,se::StreamExecutor * stream_exec)102 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
103 se::StreamExecutor* stream_exec) {
104 std::vector<AlgorithmDesc> algorithms;
105 bool succ = false;
106 switch (kind) {
107 case CudnnConvKind::kBackwardFilter:
108 succ =
109 stream_exec->GetConvolveBackwardFilterAlgorithms(true, &algorithms);
110 break;
111 case CudnnConvKind::kBackwardInput:
112 succ = stream_exec->GetConvolveBackwardDataAlgorithms(true, &algorithms);
113 break;
114 case CudnnConvKind::kForward:
115 case CudnnConvKind::kForwardActivation:
116 succ = stream_exec->GetConvolveAlgorithms(true, &algorithms);
117 break;
118 }
119 DCHECK(succ);
120
121 return algorithms;
122 }
123
GetMIOpenAlgorithms(const HloCustomCallInstruction * instr,absl::Span<se::DeviceMemoryBase> operand_buffers,se::DeviceMemoryBase result_buffer,se::StreamExecutor * stream_exec,ScratchAllocator * scratch_allocator,se::Stream * stream)124 StatusOr<std::vector<se::dnn::ProfileResult>> GetMIOpenAlgorithms(
125 const HloCustomCallInstruction* instr,
126 absl::Span<se::DeviceMemoryBase> operand_buffers,
127 se::DeviceMemoryBase result_buffer, se::StreamExecutor* stream_exec,
128 ScratchAllocator* scratch_allocator, se::Stream* stream) {
129 std::vector<se::dnn::ProfileResult> algorithms;
130
131 TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr));
132
133 TF_ASSIGN_OR_RETURN(se::dnn::ConvolutionKind kind,
134 GetDNNConvKindFromCudnnConvKind(config.kind));
135
136 TF_ASSIGN_OR_RETURN(se::dnn::DataType dtype,
137 GetDNNDataTypeFromPrimitiveType(config.output_type));
138
139 TF_ASSIGN_OR_RETURN(GpuConvParams params,
140 GetGpuConvParams(config, operand_buffers, result_buffer));
141
142 bool succ = stream_exec->GetMIOpenConvolveAlgorithms(
143 kind, dtype, stream, params.config.input_descriptor, params.input_buf,
144 params.config.filter_descriptor, params.filter_buf,
145 params.config.output_descriptor, params.output_buf,
146 params.config.conv_desc, scratch_allocator, &algorithms);
147 DCHECK(succ);
148
149 return algorithms;
150 }
151
AlgorithmToString(const AlgorithmDesc & algo)152 string AlgorithmToString(const AlgorithmDesc& algo) {
153 if (algo.tensor_ops_enabled()) {
154 return absl::StrCat(algo.algo_id(), "+TC");
155 }
156 return absl::StrCat(algo.algo_id());
157 }
158
NumBytesToString(int64 bytes)159 string NumBytesToString(int64 bytes) {
160 return absl::StrCat(tensorflow::strings::HumanReadableNumBytes(bytes), " (",
161 bytes, "B)");
162 }
163
GetCudnnVersion(se::StreamExecutor * stream_executor)164 tensorflow::CudnnVersion GetCudnnVersion(se::StreamExecutor* stream_executor) {
165 tensorflow::CudnnVersion cudnn_version;
166 if (auto* dnn = stream_executor->AsDnn()) {
167 StatusOr<se::dnn::VersionInfo> version_or = dnn->GetVersion();
168 if (version_or.ok()) {
169 const auto& version = version_or.ValueOrDie();
170 cudnn_version.set_major(version.major_version());
171 cudnn_version.set_minor(version.minor_version());
172 cudnn_version.set_patch(version.patch());
173 }
174 }
175 return cudnn_version;
176 }
177
GetComputeCapability(se::StreamExecutor * stream_executor)178 tensorflow::ComputeCapability GetComputeCapability(
179 se::StreamExecutor* stream_executor) {
180 tensorflow::ComputeCapability cc;
181 int cc_major, cc_minor;
182 stream_executor->GetDeviceDescription().cuda_compute_capability(&cc_major,
183 &cc_minor);
184 cc.set_major(cc_major);
185 cc.set_minor(cc_minor);
186 return cc;
187 }
188
PrintPlatformInfo(const se::Stream * stream)189 void PrintPlatformInfo(const se::Stream* stream) {
190 auto* se = stream->parent();
191 const auto& desc = se->GetDeviceDescription();
192 LOG(ERROR) << "Device: " << desc.name();
193 LOG(ERROR) << "Platform: " << desc.platform_version();
194 LOG(ERROR) << "Driver: " << desc.driver_version();
195 LOG(ERROR) << "Runtime: " << desc.runtime_version();
196
197 auto* dnn = se->AsDnn();
198 if (dnn) {
199 auto dnn_version = dnn->GetVersion();
200 if (dnn_version.ok()) {
201 auto v = dnn_version.ValueOrDie();
202 LOG(ERROR) << "cudnn version: " << v.major_version() << "."
203 << v.minor_version() << "." << v.patch();
204 }
205 }
206 }
207
208 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
209 // Returns true if the redzones in `allocator`'s allocations are unmodified.
210 //
211 // If the redzones are modified, logs an error, sets the appropriate failure
212 // bits on `result`, and returns false.
213 //
214 // Returns a status if an unexpected error has occurred, and the stream
215 // has been poisoned.
216 //
217 // `name` is a user-friendly name for the set of redzones being checked, e.g.
218 // "input/output" or "scratch".
CheckRedzones(const se::RedzoneAllocator & allocator,se::Stream * stream,absl::string_view name,const HloInstruction * instr,AutotuneResult * result)219 StatusOr<bool> CheckRedzones(const se::RedzoneAllocator& allocator,
220 se::Stream* stream, absl::string_view name,
221 const HloInstruction* instr,
222 AutotuneResult* result) {
223 XLA_SCOPED_LOGGING_TIMER_LEVEL("CudnnConvAlgorithmPicker checking redzones",
224 2);
225 using RedzoneCheckStatus = se::RedzoneAllocator::RedzoneCheckStatus;
226 TF_ASSIGN_OR_RETURN(RedzoneCheckStatus redzone_check,
227 allocator.CheckRedzones());
228 if (redzone_check.ok()) {
229 return true;
230 }
231
232 auto* fail = result->mutable_failure();
233 fail->set_kind(AutotuneResult::REDZONE_MODIFIED);
234 *fail->mutable_msg() = redzone_check.RedzoneFailureMsg();
235 fail->set_buffer_address(
236 reinterpret_cast<uint64>(redzone_check.user_buffer_address));
237
238 LOG(ERROR) << absl::StreamFormat(
239 "Detected cudnn out-of-bounds write in conv %s buffer! This is likely a "
240 "cudnn bug. We will skip this algorithm in the future, but your GPU "
241 "state may already be corrupted, leading to incorrect results. Within "
242 "Google, no action is needed on your part. Outside of Google, please "
243 "ensure you're running the latest version of cudnn. If that doesn't fix "
244 "the problem, please file a bug with this full error message and we'll "
245 "contact nvidia.",
246 name);
247 LOG(ERROR) << redzone_check.RedzoneFailureMsg();
248 LOG(ERROR) << "HloInstruction " << instr->ToString();
249 PrintPlatformInfo(stream);
250 return false;
251 }
252 #endif
253
// Key of the process-wide autotuning cache: the executor the convolution was
// tuned on, paired with a textual rendering of the convolution instruction
// (see AutotuneCacheKeyfromInstruction for how the string is produced).
using ConvCacheKey =
    std::tuple<se::StreamExecutor*,
               /* conv->ToString(HloPrintOptions::Canonical()) */ std::string>;
257
258 struct ConvCacheStats {
259 int64 cache_hits = 0;
260 int64 cache_misses = 0;
261
LogStatsxla::gpu::__anon559be1860111::ConvCacheStats262 void LogStats() {
263 VLOG(2) << "Cache hits: " << cache_hits;
264 VLOG(2) << "Cache misses: " << cache_misses;
265 }
266 };
267
AutotuneCacheKeyfromInstruction(const HloCustomCallInstruction * conv,se::StreamExecutor * se)268 ConvCacheKey AutotuneCacheKeyfromInstruction(
269 const HloCustomCallInstruction* conv, se::StreamExecutor* se) {
270 auto options = HloPrintOptions::Canonical();
271 options.set_print_backend_config(true);
272 return std::make_tuple(se, conv->ToString(options));
273 }
274
// Process-wide autotuning cache and its statistics, both guarded by
// `autotune_cache_lock`. The map and the stats object are heap-allocated and
// bound to references, and are never deleted.
tensorflow::mutex autotune_cache_lock(tensorflow::LINKER_INITIALIZED);
auto& autotune_cache TF_GUARDED_BY(autotune_cache_lock) =
    *new absl::flat_hash_map<ConvCacheKey, AutotuneResult>();
auto& autotune_cache_stats TF_GUARDED_BY(autotune_cache_lock) =
    *new ConvCacheStats();
280 } // anonymous namespace
281
PickBestAlgorithm(const HloCustomCallInstruction * instr)282 StatusOr<AutotuneResult> GpuConvAlgorithmPicker::PickBestAlgorithm(
283 const HloCustomCallInstruction* instr) {
284 // Don't run this function concurrently on the same GPU.
285 //
286 // This is a bit of a hack and doesn't protect us against arbitrary concurrent
287 // use of a GPU, but it's sufficient to let us compile two HLO modules
288 // concurrently and then run them sequentially.
289 //
290 // Putting the lock in here rather than in PickBestAlgorithmNoCache lets us
291 // avoid ever doing duplicate work. If we have a cache miss, only one thread
292 // will run PickBestAlgorithmImpl for a particular device.
293 tensorflow::mutex_lock lock = LockGpu(stream_exec_);
294
295 // We cache the autotuning results to avoid doing the duplicate work,
296 // which can greatly improve both stability (deterministic numeric results
297 // within a process for a given input) and performance (2x speedup on some
298 // models).
299 ConvCacheKey key = AutotuneCacheKeyfromInstruction(instr, stream_exec_);
300 {
301 tensorflow::mutex_lock lock(autotune_cache_lock);
302 auto it = autotune_cache.find(key);
303 if (it != autotune_cache.end()) {
304 autotune_cache_stats.cache_hits++;
305 return it->second;
306 }
307 autotune_cache_stats.cache_misses++;
308 }
309
310 // Make sure any previous activity on this executor is done. We don't want to
311 // interfere with programs that are still running on the GPU.
312 if (!stream_exec_->SynchronizeAllActivity()) {
313 return InternalError("Failed to synchronize GPU for autotuning.");
314 }
315
316 // allocator either points to this->allocator_ or, if that's null, to a
317 // se::StreamExecutorMemoryAllocator for stream_exec_.
318 se::DeviceMemoryAllocator* allocator;
319 optional<se::StreamExecutorMemoryAllocator> se_allocator;
320 if (allocator_ != nullptr) {
321 allocator = allocator_;
322 } else {
323 se_allocator.emplace(stream_exec_);
324 allocator = &*se_allocator;
325 }
326
327 TF_ASSIGN_OR_RETURN(se::Stream* const stream,
328 allocator->GetStream(stream_exec_->device_ordinal()));
329 StatusOr<AutotuneResult> result_or(InternalError("Unknown platform."));
330 // Check StreamExecutor on which platform it is. ROCm and Cuda implementation
331 // have diverged. Specifically, we need to make sure redzone allocator related
332 // utilities are not used in ROCm routine
333 if (stream_exec_->platform_kind() == se::PlatformKind::kROCm) {
334 result_or = PickBestAlgorithmNoCacheRocm(instr, allocator, stream);
335 } else if (stream_exec_->platform_kind() == se::PlatformKind::kCuda) {
336 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
337 result_or = PickBestAlgorithmNoCacheCuda(instr, allocator, stream);
338 #endif
339 }
340
341 if (result_or.ok()) {
342 tensorflow::mutex_lock lock(autotune_cache_lock);
343 CHECK(autotune_cache.insert({key, result_or.ValueOrDie()}).second);
344 }
345 return result_or;
346 }
347
348 // The following function allows deterministic ops to be implemented relatively
349 // quickly using environment variables. It is intended to be temporary. The
350 // longer-term intention is to enable deterministic ops via tf.config and
351 // appropriate plumbing. See the discussion on PR 34951 for more information:
352 // https://github.com/tensorflow/tensorflow/pull/34951#discussion_r355682316
353 // This function and associated comment are replicated in the following three
354 // places:
355 // 1. tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc
356 // 2. tensorflow/core/kernels/gpu_utils.cc
357 // 3. tensorflow/stream_executor/cuda/cuda_dnn.cc
358 // When implementing the plumbing, you should also search for the use of
359 // TF_DETERMINISTIC_OPS on its own.
360 // TODO(duncanriach): move to an API that uses tf.config and implement the first
361 // phase of plumbing.
RequireCudnnDeterminism()362 static bool RequireCudnnDeterminism() {
363 static bool require_cudnn_determinism = [] {
364 bool deterministic_ops = false;
365 TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_DETERMINISTIC_OPS",
366 /*default_val=*/false,
367 &deterministic_ops));
368 bool cudnn_deterministic = false;
369 TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar("TF_CUDNN_DETERMINISTIC",
370 /*default_val=*/false,
371 &cudnn_deterministic));
372 return deterministic_ops || cudnn_deterministic;
373 }();
374 return require_cudnn_determinism;
375 }
376
377 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
// Autotunes the convolution `instr` on a CUDA device, without consulting the
// cache.
//
// Each candidate algorithm that is not on the denylist is run once with
// profiling enabled. Depending on xla_gpu_autotune_level, buffers are first
// filled via InitializeBuffer (level > 1) and each run is checked for redzone
// writes and compared against the output of the first algorithm that passed
// the redzone check (level > 3). All results are logged, and the returned
// result is either the fastest surviving candidate or, when determinism is
// requested, the first surviving candidate. Returns an InternalError when no
// candidate survives.
StatusOr<tensorflow::AutotuneResult>
GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda(
    const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
    se::Stream* stream) {
  // Right now Redzone allocator is available in Cuda target only
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));

  // The custom call returns a tuple; element 0 is the convolution result.
  const Shape& result_shape = instr->shape().tuple_shapes(0);
  int64 rng_state = 0;

  const HloModuleConfig& hlo_module_config = instr->GetModule()->config();
  const int32 conv_autotune_level =
      hlo_module_config.debug_options().xla_gpu_autotune_level();
  // Level > 1: initialize the buffers. Level > 3: additionally verify
  // redzones and cross-check results between algorithms.
  const bool init_conv_data = conv_autotune_level > 1;
  const bool check_conv = conv_autotune_level > 3;
  const auto initialize_buffer = [init_conv_data, &stream, &rng_state](
                                     DeviceMemoryBase buffer,
                                     const Shape& buffer_shape) {
    if (init_conv_data) {
      InitializeBuffer(stream, buffer_shape.element_type(), &rng_state, buffer);
    }
  };

  // Allocate space for the input, filter, and output of the convolution.
  se::RedzoneAllocator input_output_allocator(
      stream, allocator, PtxOptsFromConfig(hlo_module_config));
  std::vector<se::DeviceMemoryBase> operand_buffers;
  for (const auto* operand : instr->operands()) {
    TF_ASSIGN_OR_RETURN(auto buffer,
                        input_output_allocator.AllocateBytes(
                            ShapeUtil::ByteSizeOf(operand->shape())));
    initialize_buffer(buffer, operand->shape());
    operand_buffers.push_back(buffer);
  }
  TF_ASSIGN_OR_RETURN(auto result_buffer,
                      input_output_allocator.AllocateBytes(
                          ShapeUtil::ByteSizeOf(result_shape)));
  initialize_buffer(result_buffer, result_shape);

  // NOTE(review): `backend_config` is not read below; the statement still
  // returns early if the backend config cannot be parsed.
  TF_ASSIGN_OR_RETURN(auto backend_config,
                      instr->backend_config<CudnnConvBackendConfig>());

  optional<BufferComparator> comparator;
  // Use the first algorithm that's supported as reference. There isn't a
  // particular reason to use it, as any algorithm suffices. It doesn't make
  // this algorithm considered correct, though.
  se::DeviceMemoryBase reference_result_buffer;
  AlgorithmDesc first_algorithm;

  TF_ASSIGN_OR_RETURN(CudnnConvKind kind, GetCudnnConvKind(instr));
  std::vector<AutotuneResult> profile_results;

  const DebugOptions& debug_options =
      instr->GetModule()->config().debug_options();

  const bool crash_on_checking_failure =
      debug_options.xla_gpu_crash_on_verification_failures();

  // Same textual form of the instruction as used in the autotune cache key.
  const auto canonical_hlo =
      std::get<1>(AutotuneCacheKeyfromInstruction(instr, stream_exec_));

  // Best effort: `blas_version` stays empty if the executor has no BLAS
  // support; the result of GetVersion is deliberately discarded.
  string blas_version;
  if (auto* blas = stream_exec_->AsBlas()) {
    (void)blas->GetVersion(&blas_version);
  }

  absl::Span<const AlgorithmDesc> disabled_algos = GetDisabledConvAlgorithms(
      GetComputeCapability(stream_exec_), GetCudnnVersion(stream_exec_),
      blas_version, canonical_hlo);

  TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr));

  for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) {
    XLA_SCOPED_LOGGING_TIMER_LEVEL(
        absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
                     AlgorithmToString(alg)),
        2);

    if (absl::c_linear_search(disabled_algos, alg)) {
      LOG(INFO) << "Omitted potentially buggy algorithm "
                << AlgorithmToString(alg) << " for conv " << instr->ToString();
      continue;
    }

    // A fresh scratch allocator per algorithm, so that the scratch byte count
    // and the scratch redzone check below cover this algorithm only.
    se::RedzoneAllocator scratch_allocator(
        stream, allocator, PtxOptsFromConfig(hlo_module_config));
    se::dnn::ProfileResult profile_result;
    VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
            << instr->ToString();

    // Use assignment instead of brace-list to make GCC 4.9 happy.
    RunConvOptions options;
    options.profile_result = &profile_result;
    options.algo_override = alg;
    Status launch_status =
        RunGpuConv(config, absl::MakeSpan(operand_buffers), result_buffer,
                   &scratch_allocator, stream, options);

    // Algorithms that fail to launch or yield no valid profile are skipped
    // without being recorded.
    if (!launch_status.ok()) {
      continue;
    }

    if (!profile_result.is_valid()) {
      continue;
    }

    profile_results.emplace_back();
    AutotuneResult& result = profile_results.back();
    result.mutable_conv()->set_algorithm(alg.algo_id());
    result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());

    int64 scratch_bytes_used =
        scratch_allocator.TotalAllocatedBytesExcludingRedzones();
    result.set_scratch_bytes(scratch_bytes_used);
    *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
        absl::Milliseconds(profile_result.elapsed_time_in_ms()));

    if (!check_conv) {
      continue;
    }

    // Check for writes to redzones.
    TF_ASSIGN_OR_RETURN(bool input_output_allocator_redzone_clear,
                        CheckRedzones(input_output_allocator, stream,
                                      "input/output", instr, &result));

    TF_ASSIGN_OR_RETURN(
        bool scratch_allocator_redzone_clear,
        CheckRedzones(scratch_allocator, stream, "scratch", instr, &result));

    if (!input_output_allocator_redzone_clear ||
        !scratch_allocator_redzone_clear) {
      // Print a ready-to-paste denylist entry for the offending algorithm.
      // The result stays in `profile_results` with its failure set, so it is
      // logged and later filtered out.
      AlgorithmDenylist proto;
      auto entry = proto.add_entries();
      entry->set_hlo(canonical_hlo);
      *entry->mutable_cc() = GetComputeCapability(stream_exec_);
      *entry->mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
      entry->set_blas_version(blas_version);
      auto algo = entry->add_algos();
      algo->set_id(alg.algo_id());
      algo->set_tensor_ops(alg.tensor_ops_enabled());

      LOG(ERROR) << "To denylist this algorithm for this convolution, "
                    "copy-paste the following "
                    "proto to the denylist file pointed by XLA_FLAGS "
                    "--xla_gpu_algorithm_denylist_path="
                 << GetDebugOptionsFromFlags().xla_gpu_algorithm_denylist_path()
                 << " : " << proto.ShortDebugString();
      continue;
    }

    if (comparator.has_value()) {
      // A reference output exists: compare this algorithm's output to it.
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::CompareEqual", 2);
      StatusOr<bool> compare_result = comparator->CompareEqual(
          stream, reference_result_buffer, result_buffer);
      if (!compare_result.ok()) {
        LOG(ERROR) << "Unable to compare " << AlgorithmToString(first_algorithm)
                   << " against " << AlgorithmToString(alg) << " for "
                   << instr->ToString() << ": " << compare_result.status();
        if (compare_result.status().code() ==
            tensorflow::error::RESOURCE_EXHAUSTED) {
          // Possibly OOM. Propagate the error.
          return compare_result.status();
        }
        CHECK(!crash_on_checking_failure);
      } else if (!compare_result.ValueOrDie()) {
        LOG(ERROR)
            << "Results mismatch between different convolution algorithms. "
               "This is likely a bug/unexpected loss of precision in cudnn.\n"
            << instr->ToString() << " for "
            << AlgorithmToString(first_algorithm) << " vs "
            << AlgorithmToString(alg);
        PrintPlatformInfo(stream);
        VLOG(1) << "Full module on failure: \n"
                << instr->GetModule()->ToString();
        auto* fail = result.mutable_failure();
        fail->set_kind(AutotuneResult::WRONG_RESULT);
        fail->set_buffer_address(
            reinterpret_cast<uint64>(result_buffer.opaque()));
        auto* reference_conv = fail->mutable_reference_conv();
        reference_conv->set_algorithm(first_algorithm.algo_id());
        reference_conv->set_tensor_ops_enabled(
            first_algorithm.tensor_ops_enabled());
      }
    } else {
      // First algorithm to pass the redzone checks: keep a copy of its output
      // as the reference for all later comparisons.
      XLA_SCOPED_LOGGING_TIMER_LEVEL("BufferComparator::Create", 2);
      comparator.emplace(result_shape, hlo_module_config);
      TF_ASSIGN_OR_RETURN(
          reference_result_buffer,
          input_output_allocator.AllocateBytes(result_buffer.size()));
      stream->ThenMemcpy(&reference_result_buffer, result_buffer,
                         result_buffer.size());
      first_algorithm = alg;
    }
  }

  // Log the autotuning result.
  {
    tensorflow::AutotuningLog log;
    {
      ConvInstructionLog instr_log;
      *instr_log.mutable_instruction() = instr->ToProto();
      for (int i = 0; i < instr->operand_count(); i++) {
        *instr_log.add_operand_shapes() = instr->operand(i)->shape().ToProto();
        instr_log.add_operand_addresses(
            reinterpret_cast<uint64>(operand_buffers[i].opaque()));
      }
      instr_log.set_result_address(
          reinterpret_cast<uint64>(result_buffer.opaque()));
      log.mutable_instr()->PackFrom(instr_log);
    }
    for (const auto& profile : profile_results) {
      *log.add_results() = profile;
    }
    *log.mutable_compute_capability() = GetComputeCapability(stream_exec_);
    *log.mutable_cudnn_version() = GetCudnnVersion(stream_exec_);
    log.set_device_pci_bus_id(
        stream_exec_->GetDeviceDescription().pci_bus_id());
    log.set_blas_version(blas_version);
    VLOG(1) << "Autotuning result: " << log.ShortDebugString();
    // If we crash on checking failure, we are in a testing/benchmark mode, thus
    // omitting logging through the logger.
    if (!crash_on_checking_failure) {
      tensorflow::Logger::GetSingleton()->LogProto(log);
    }
  }

  // Crash on miscompares and redzone violations if desired. Do this after
  // logging the autotuning results, otherwise we won't get any data!
  for (const auto& result : profile_results) {
    if (result.has_failure()) {
      CHECK(!crash_on_checking_failure);
    }
  }

  // Choose the fastest convolution that doesn't produce a REDZONE_MODIFIED
  // error.
  //
  // TODO(jlebar): We ought to be able to detect redzone reads by noticing NaNs
  // in the output of the conv and skip those.
  //
  // For now, we ignore WRONG_RESULT failures because false-positives are
  // possible (e.g. perhaps the reference algorithm is the one that's
  // incorrect!). But we don't ignore REDZONE_MODIFIED failures because they're
  // quite severe and can be detected with high accuracy.
  std::vector<AutotuneResult> filtered_results;
  absl::c_copy_if(
      profile_results, std::back_inserter(filtered_results),
      [](const AutotuneResult& r) {
        return !(r.has_failure() &&
                 r.failure().kind() != AutotuneResult::WRONG_RESULT);
      });
  if (filtered_results.empty()) {
    return InternalError(
        "All algorithms tried for convolution %s failed. Falling back to "
        "default algorithm. ",
        instr->ToString());
  }

  // When determinism is requested (via environment variable or debug option),
  // timing is ignored and the first surviving result is used; otherwise the
  // result with the shortest run time wins.
  auto selected_result = filtered_results.begin();
  if (!RequireCudnnDeterminism() &&
      !hlo_module_config.debug_options().xla_gpu_deterministic_ops()) {
    selected_result = absl::c_min_element(
        filtered_results,
        [](const AutotuneResult& lhs, const AutotuneResult& rhs) {
          return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
                 tensorflow::proto_utils::FromDurationProto(rhs.run_time());
        });
  }

  return *selected_result;
}
651 #endif
652
653 StatusOr<tensorflow::AutotuneResult>
PickBestAlgorithmNoCacheRocm(const HloCustomCallInstruction * instr,se::DeviceMemoryAllocator * allocator,se::Stream * stream)654 GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm(
655 const HloCustomCallInstruction* instr, se::DeviceMemoryAllocator* allocator,
656 se::Stream* stream) {
657 XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
658 "GpuConvAlgorithmPicker::PickBestAlgorithmImpl for ", instr->ToString()));
659
660 const auto device_ordinal = stream_exec_->device_ordinal();
661 std::vector<se::DeviceMemoryBase> operand_buffers;
662
663 ScratchAllocator input_output_allocator(device_ordinal, allocator);
664 const auto initialize_buffer = [stream](DeviceMemoryBase buffer) {
665 // Although we don't have evidence this matters, zero out the buffers
666 // before autotuning. It's conceivable that using uninitialized memory as
667 // the inputs might affect performance if e.g. the inputs contain
668 // denormals, and this is easy enough.
669 stream->ThenMemZero(&buffer, buffer.size());
670 };
671
672 // Allocate space for the input, filter, and output of the convolution. We
673 // use a ScratchAllocator for this instead of calling allocator_ directly so
674 // that our allocations don't leak.
675 for (const auto* operand : instr->operands()) {
676 TF_ASSIGN_OR_RETURN(auto buffer,
677 input_output_allocator.AllocateBytes(
678 ShapeUtil::ByteSizeOf(operand->shape())));
679 initialize_buffer(buffer);
680 operand_buffers.push_back(buffer);
681 }
682
683 TF_ASSIGN_OR_RETURN(
684 auto result_buffer,
685 input_output_allocator.AllocateBytes(
686 ShapeUtil::ByteSizeOf(instr->shape().tuple_shapes(0))));
687 initialize_buffer(result_buffer);
688
689 ScratchAllocator scratch_allocator(device_ordinal, allocator);
690
691 TF_ASSIGN_OR_RETURN(
692 std::vector<se::dnn::ProfileResult> algorithms,
693 GetMIOpenAlgorithms(instr, absl::MakeSpan(operand_buffers), result_buffer,
694 stream_exec_, &scratch_allocator, stream));
695
696 std::vector<AutotuneResult> profile_results;
697
698 if (algorithms.size() == 1) {
699 auto profile_result = algorithms[0];
700 profile_results.emplace_back();
701 auto& result = profile_results.back();
702 result.mutable_conv()->set_algorithm(profile_result.algorithm().algo_id());
703 result.mutable_conv()->set_tensor_ops_enabled(
704 profile_result.algorithm().tensor_ops_enabled());
705
706 result.set_scratch_bytes(profile_result.scratch_size());
707 *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
708 absl::Milliseconds(profile_result.elapsed_time_in_ms()));
709 } else {
710 TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr));
711 for (const auto& miopen_alg : algorithms) {
712 const auto& alg = miopen_alg.algorithm();
713 XLA_SCOPED_LOGGING_TIMER_LEVEL(
714 absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ",
715 AlgorithmToString(alg)),
716 2);
717
718 se::dnn::ProfileResult profile_result;
719 VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for "
720 << instr->ToString();
721
722 // Use assignment instead of brace-list to make GCC 4.9 happy.
723 RunConvOptions options;
724 options.profile_result = &profile_result;
725 options.algo_override = alg;
726 options.scratch_size_override = miopen_alg.scratch_size();
727 Status launch_status =
728 RunGpuConv(config, absl::MakeSpan(operand_buffers), result_buffer,
729 &scratch_allocator, stream, options);
730
731 if (!launch_status.ok()) {
732 continue;
733 }
734
735 if (!profile_result.is_valid()) {
736 continue;
737 }
738
739 profile_results.emplace_back();
740 AutotuneResult& result = profile_results.back();
741 result.mutable_conv()->set_algorithm(alg.algo_id());
742 result.mutable_conv()->set_tensor_ops_enabled(alg.tensor_ops_enabled());
743
744 int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes();
745 result.set_scratch_bytes(scratch_bytes_used);
746 *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto(
747 absl::Milliseconds(profile_result.elapsed_time_in_ms()));
748 }
749 }
750 const auto& best_result = absl::c_min_element(
751 profile_results,
752 [&](const AutotuneResult& lhs, const AutotuneResult& rhs) {
753 return tensorflow::proto_utils::FromDurationProto(lhs.run_time()) <
754 tensorflow::proto_utils::FromDurationProto(rhs.run_time());
755 });
756
757 if (best_result != profile_results.end()) {
758 return *best_result;
759 }
760
761 return InternalError(
762 "All algorithms tried for convolution %s failed. Falling back to "
763 "default algorithm.",
764 instr->ToString());
765 }
766
RunOnInstruction(HloInstruction * instr)767 StatusOr<bool> GpuConvAlgorithmPicker::RunOnInstruction(HloInstruction* instr) {
768 CHECK(IsCustomCallToDnnConvolution(*instr));
769
770 StatusOr<AutotuneResult> best_algo_or =
771 PickBestAlgorithm(Cast<HloCustomCallInstruction>(instr));
772 if (!best_algo_or.ok()) {
773 LOG(WARNING) << "Failed to determine best cudnn convolution algorithm: "
774 << best_algo_or.status()
775 << "\n\nConvolution performance may be suboptimal.";
776 return false;
777 }
778
779 auto best_algo = std::move(best_algo_or).ValueOrDie();
780 VLOG(2) << "Setting cudnn conv to use algorithm "
781 << best_algo.conv().algorithm() << " and "
782 << NumBytesToString(best_algo.scratch_bytes())
783 << " of scratch memory: " << instr->ToString()
784 << " tensor_ops_enabled: " << best_algo.conv().tensor_ops_enabled();
785
786 // Replace instr with a new CustomCall which has the correct algorithm, and
787 // whose output shape has the appropriate amount of scratch memory.
788 HloComputation* computation = instr->parent();
789 Shape new_call_shape = ShapeUtil::MakeTupleShape(
790 {instr->shape().tuple_shapes(0),
791 ShapeUtil::MakeShape(U8, {best_algo.scratch_bytes()})});
792
793 TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
794 instr->backend_config<CudnnConvBackendConfig>());
795 backend_config.set_algorithm(best_algo.conv().algorithm());
796 backend_config.set_tensor_ops_enabled(best_algo.conv().tensor_ops_enabled());
797
798 HloInstruction* new_call = computation->AddInstruction(
799 instr->CloneWithNewOperands(new_call_shape, instr->operands()));
800
801 VLOG(2) << "Replacing convolution " << instr->ToString() << " with "
802 << new_call->ToString();
803
804 TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));
805
806 // Repackage new_call so it has the same shape as the original call, namely
807 // (conv_result, u8[0]).
808 HloInstruction* new_tuple =
809 computation->AddInstruction(HloInstruction::CreateTuple(
810 {computation->AddInstruction(HloInstruction::CreateGetTupleElement(
811 new_call_shape.tuple_shapes(0), new_call, 0)),
812 computation->AddInstruction(HloInstruction::CreateConstant(
813 LiteralUtil::CreateR1<uint8>({})))}));
814
815 TF_RETURN_IF_ERROR(instr->parent()->ReplaceInstruction(instr, new_tuple));
816 return true;
817 }
818
RunOnComputation(HloComputation * computation)819 StatusOr<bool> GpuConvAlgorithmPicker::RunOnComputation(
820 HloComputation* computation) {
821 std::vector<HloInstruction*> convs;
822 for (auto* instr : computation->instructions()) {
823 if (IsCustomCallToDnnConvolution(*instr)) {
824 convs.push_back(instr);
825 }
826 }
827
828 bool changed = false;
829 for (auto* instr : convs) {
830 TF_ASSIGN_OR_RETURN(bool result, RunOnInstruction(instr));
831 changed |= result;
832 }
833 return changed;
834 }
835
Run(HloModule * module)836 StatusOr<bool> GpuConvAlgorithmPicker::Run(HloModule* module) {
837 XLA_SCOPED_LOGGING_TIMER("GpuConvAlgorithmPicker");
838
839 if (module->config().debug_options().xla_gpu_autotune_level() == 0) {
840 VLOG(2) << "Convolution auto-tuning disabled, GpuConvAlgorithmPicker "
841 "returning early.";
842 return false;
843 }
844
845 bool changed = false;
846 for (HloComputation* computation : module->MakeNonfusionComputations()) {
847 TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
848 changed |= result;
849 }
850
851 {
852 tensorflow::mutex_lock lock(autotune_cache_lock);
853 autotune_cache_stats.LogStats();
854 }
855
856 return changed;
857 }
858
859 } // namespace gpu
860 } // namespace xla
861