1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/stream_executor/device_description.h"
17 
18 #include <algorithm>
19 
20 #include "absl/strings/str_cat.h"
21 #include "tensorflow/stream_executor/lib/human_readable.h"
22 #include "tensorflow/stream_executor/lib/mathutil.h"
23 
24 namespace stream_executor {
25 
26 static const uint64 kUninitializedUint64 = -1ULL;
27 /* static */ const char *DeviceDescription::kUndefinedString = "<undefined>";
28 
DeviceDescription()29 DeviceDescription::DeviceDescription()
30     : device_vendor_(kUndefinedString),
31       platform_version_(kUndefinedString),
32       driver_version_(kUndefinedString),
33       runtime_version_(kUndefinedString),
34       pci_bus_id_(kUndefinedString),
35       name_(kUndefinedString),
36       thread_dim_limit_(kUninitializedUint64, kUninitializedUint64,
37                         kUninitializedUint64),
38       block_dim_limit_(kUninitializedUint64, kUninitializedUint64,
39                        kUninitializedUint64),
40       blocks_per_core_limit_(kUninitializedUint64),
41       threads_per_core_limit_(kUninitializedUint64),
42       threads_per_block_limit_(kUninitializedUint64),
43       threads_per_warp_(kUninitializedUint64),
44       registers_per_core_limit_(kUninitializedUint64),
45       registers_per_block_limit_(kUninitializedUint64),
46       device_address_bits_(kUninitializedUint64),
47       device_memory_size_(kUninitializedUint64),
48       memory_bandwidth_(kUninitializedUint64),
49       shared_memory_per_core_(kUninitializedUint64),
50       shared_memory_per_block_(kUninitializedUint64),
51       clock_rate_ghz_(-1.0),
52       cuda_compute_capability_major_(-1),
53       cuda_compute_capability_minor_(-1),
54       rocm_amdgpu_isa_version_(-1),
55       numa_node_(-1),
56       core_count_(-1),
57       ecc_enabled_(false) {}
58 
ToMap() const59 std::unique_ptr<std::map<string, string>> DeviceDescription::ToMap() const {
60   std::unique_ptr<std::map<string, string>> owned_result{
61       new std::map<string, string>};
62   std::map<string, string> &result = *owned_result;
63   result["Device Vendor"] = device_vendor();
64   result["Platform Version"] = platform_version();
65   result["Driver Version"] = driver_version();
66   result["Runtime Version"] = runtime_version();
67   result["PCI bus ID"] = pci_bus_id_;
68   result["Device Name"] = name_;
69 
70   const ThreadDim &thread_dim = thread_dim_limit();
71   result["ThreadDim Limit"] =
72       absl::StrCat(thread_dim.x, ",", thread_dim.y, ",", thread_dim.z);
73   const BlockDim &block_dim = block_dim_limit();
74   result["BlockDim Limit"] =
75       absl::StrCat(block_dim.x, ",", block_dim.y, ",", block_dim.z);
76 
77   result["Threads Per Core Limit"] = absl::StrCat(threads_per_core_limit());
78   result["Threads Per Block Limit"] = absl::StrCat(threads_per_block_limit());
79   result["Registers Per Block Limit"] =
80       absl::StrCat(registers_per_block_limit());
81 
82   result["Device Address Bits"] = absl::StrCat(device_address_bits());
83   result["Device Memory Size"] =
84       port::HumanReadableNumBytes::ToString(device_memory_size());
85   result["Memory Bandwidth"] = absl::StrCat(
86       port::HumanReadableNumBytes::ToString(memory_bandwidth_), "/s");
87 
88   result["Shared Memory Per Core"] =
89       port::HumanReadableNumBytes::ToString(shared_memory_per_core_);
90   result["Shared Memory Per Block"] =
91       port::HumanReadableNumBytes::ToString(shared_memory_per_block_);
92 
93   result["Clock Rate GHz"] = absl::StrCat(clock_rate_ghz());
94 
95   result["CUDA Compute Capability"] = absl::StrCat(
96       cuda_compute_capability_major_, ".", cuda_compute_capability_minor_);
97 
98   result["NUMA Node"] = absl::StrCat(numa_node());
99   result["Core Count"] = absl::StrCat(core_count());
100   result["ECC Enabled"] = absl::StrCat(ecc_enabled());
101   return owned_result;
102 }
103 
104 namespace internal {
105 
DeviceDescriptionBuilder()106 DeviceDescriptionBuilder::DeviceDescriptionBuilder()
107     : device_description_(new DeviceDescription) {}
108 
109 }  // namespace internal
110 
cuda_compute_capability(int * major,int * minor) const111 bool DeviceDescription::cuda_compute_capability(int *major, int *minor) const {
112   *major = cuda_compute_capability_major_;
113   *minor = cuda_compute_capability_minor_;
114   return cuda_compute_capability_major_ != 0;
115 }
116 
rocm_amdgpu_isa_version(int * version) const117 bool DeviceDescription::rocm_amdgpu_isa_version(int *version) const {
118   bool status = false;
119   if (rocm_amdgpu_isa_version_ > 0) {
120     *version = rocm_amdgpu_isa_version_;
121     status = true;
122   }
123   return status;
124 }
125 
ThreadDimOk(const DeviceDescription & device_description,const ThreadDim & thread_dim)126 bool ThreadDimOk(const DeviceDescription &device_description,
127                  const ThreadDim &thread_dim) {
128   auto total_threads = thread_dim.x * thread_dim.y * thread_dim.z;
129   auto threads_per_block_limit = device_description.threads_per_block_limit();
130   if (total_threads > threads_per_block_limit) {
131     VLOG(2) << "exceeded total-thread-per-block limit: " << total_threads
132             << " vs limit " << threads_per_block_limit;
133     return false;
134   }
135 
136   const auto &limit = device_description.thread_dim_limit();
137   bool ok = thread_dim.x <= limit.x && thread_dim.y <= limit.y &&
138             thread_dim.z <= limit.z;
139   if (!ok) {
140     VLOG(2) << "thread dim " << thread_dim.ToString()
141             << " exceeds limit contraints of " << limit.ToString();
142   }
143   return ok;
144 }
145 
DivideCeil(uint64 x,uint64 y)146 uint64 DivideCeil(uint64 x, uint64 y) {
147   return port::MathUtil::CeilOfRatio(x, y);
148 }
149 
CalculateDimensionality(const DeviceDescription & device_description,int64 element_count,int64 * threads_per_block,int64 * block_count)150 void CalculateDimensionality(const DeviceDescription &device_description,
151                              int64 element_count, int64 *threads_per_block,
152                              int64 *block_count) {
153   *threads_per_block = device_description.threads_per_block_limit();
154   *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
155   if (*block_count == 1) {
156     CHECK_LE(element_count, *threads_per_block);
157     *threads_per_block = element_count;
158   }
159 }
160 
161 }  // namespace stream_executor
162