1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Describes the underlying platform for a StreamExecutor; e.g. OpenCL or CUDA
17 // device and platform properties. Also contains convenience functions for
18 // checking/calculating launch dimensionality based on device properties.
19 
20 #ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
21 #define TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
22 
#include <map>
#include <memory>
#include <string>
#include <utility>

#include "absl/base/macros.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/platform/port.h"
28 
29 namespace stream_executor {
30 namespace internal {
31 class DeviceDescriptionBuilder;
32 }  // namespace internal
33 
34 // Data that describes the execution target of the StreamExecutor, in terms of
35 // important logical parameters. These include dimensionality limits and
36 // physical parameters of interest, such as number of cores present on the
37 // device.
38 //
39 // Thread-safe: immutable post-initialization.
40 class DeviceDescription {
41  public:
42   // Returns the platform being run on; this value is primarily intended for
43   // printing, and comes out something like "OpenCL 1.2" or "Compute Capability
44   // 3.5".
platform_version()45   const std::string &platform_version() const { return platform_version_; }
46 
47   // Returns the driver version interfacing with the underlying platform. Vendor
48   // dependent format.
driver_version()49   const std::string &driver_version() const { return driver_version_; }
50 
51   // Return the runtime version, if one is provided by the underlying platform.
52   // Vendor dependent format / usefulness.
runtime_version()53   const std::string &runtime_version() const { return runtime_version_; }
54 
55   // Returns the name that the device reports. Vendor dependent.
name()56   const std::string &name() const { return name_; }
57 
58   // Returns the PCI bus identifier for this device, of the form
59   // [domain]:[bus]:[device].[function]
pci_bus_id()60   const std::string &pci_bus_id() const { return pci_bus_id_; }
61 
62   // Returns the NUMA node associated with this device, for use in
63   // determining socket locality. If the NUMA node could not be determined, -1
64   // is returned.
numa_node()65   int numa_node() const { return numa_node_; }
66 
67   // Number of cores (traditional notion of core; i.e. an SM on an NVIDIA device
68   // or an AMD Compute Unit.
core_count()69   int core_count() const { return core_count_; }
70 
71   // Returns the limit on the thread dimensionality values in each of the
72   // respective dimensions. These limits affect what constitutes a legitimate
73   // kernel launch request.
thread_dim_limit()74   const ThreadDim &thread_dim_limit() const { return thread_dim_limit_; }
75 
76   // Returns the limit on the block dimensionality values in each of the
77   // respective dimensions. These limits may affect what constitutes a
78   // legitimate kernel launch request.
block_dim_limit()79   const BlockDim &block_dim_limit() const { return block_dim_limit_; }
80 
81   // Returns the limit on the total number of threads that can be launched in a
82   // single block; i.e. the limit on x * y * z dimensions of a ThreadDim.
83   // This limit affects what constitutes a legitimate kernel launch request.
threads_per_block_limit()84   const int64 &threads_per_block_limit() const {
85     return threads_per_block_limit_;
86   }
87 
88   // Returns the limit on the total number of threads that can be simultaneously
89   // launched on a given multiprocessor.
threads_per_core_limit()90   const int64 &threads_per_core_limit() const {
91     return threads_per_core_limit_;
92   }
93 
94   // Returns the number of threads per warp/wavefront.
threads_per_warp()95   const int64 &threads_per_warp() const { return threads_per_warp_; }
96 
97   // Returns the limit on the total number of registers per core.
registers_per_core_limit()98   const int64 &registers_per_core_limit() const {
99     return registers_per_core_limit_;
100   }
101 
102   // Returns the limit on the total number of registers that can be
103   // simultaneously used by a block.
registers_per_block_limit()104   const int64 &registers_per_block_limit() const {
105     return registers_per_block_limit_;
106   }
107 
108   // Returns the number of address bits available to kernel code running on the
109   // platform. This affects things like the maximum allocation size and perhaps
110   // types used in kernel code such as size_t.
device_address_bits()111   const int64 &device_address_bits() const { return device_address_bits_; }
112 
113   // Returns the device memory size in bytes.
device_memory_size()114   int64 device_memory_size() const { return device_memory_size_; }
115 
116   // Returns the device's memory bandwidth in bytes/sec.  (This is for
117   // reads/writes to/from the device's own memory, not for transfers between the
118   // host and device.)
memory_bandwidth()119   int64 memory_bandwidth() const { return memory_bandwidth_; }
120 
121   // Returns the device's core clock rate in GHz.
clock_rate_ghz()122   float clock_rate_ghz() const { return clock_rate_ghz_; }
123 
124   // Returns whether ECC is enabled.
ecc_enabled()125   bool ecc_enabled() const { return ecc_enabled_; }
126 
127   // Returns the device vendor string, e.g., "NVIDIA Corporation", "Advanced
128   // Micro Devices, Inc.", or "GenuineIntel".
device_vendor()129   const std::string &device_vendor() const { return device_vendor_; }
130 
131   // Returns the CUDA compute capability if we're running on the CUDA platform.
132   // If a CUDA compute capability is not available, the major version will be
133   // zero, and the return value will be false.
134   bool cuda_compute_capability(int *major, int *minor) const;
135 
136   // Returns the AMDGPU ISA version if we're running on the ROCm platform.
137   // If the information is not available, the version is not modified,
138   // and the return value will be false.
139   bool rocm_amdgpu_isa_version(int *version) const;
140 
141   // Returns the
142   // * AMDGPU GCN Architecture Name if we're running on the ROCm platform.
143   // * kUndefinedString otherwise
rocm_amdgpu_gcn_arch_name()144   const std::string rocm_amdgpu_gcn_arch_name() const {
145     return rocm_amdgpu_gcn_arch_name_;
146   }
147 
148   // Returns the maximum amount of shared memory present on a single core
149   // (i.e. Streaming Multiprocessor on NVIDIA GPUs; Compute Unit for OpenCL
150   // devices). Note that some devices, such as NVIDIA's have a configurable
151   // partitioning between shared memory and L1 cache.
shared_memory_per_core()152   int64 shared_memory_per_core() const { return shared_memory_per_core_; }
153 
154   // Returns the maximum amount of shared memory available for a single block.
shared_memory_per_block()155   int64 shared_memory_per_block() const { return shared_memory_per_block_; }
156 
157   // TODO(leary): resident blocks per core will be useful.
158 
159   // Convenience typedef for the string-based DeviceDescription mapping.
160   typedef std::map<std::string, std::string> Map;
161 
162   // Returns a mapping from readable names to readable values that describe the
163   // device. This is useful for things like printing.
164   std::unique_ptr<Map> ToMap() const;
165 
166   // For string values that are not available via the underlying platform, this
167   // value will be provided.
168   static const char *kUndefinedString;
169 
170  private:
171   friend class internal::DeviceDescriptionBuilder;
172 
173   DeviceDescription();
174 
175   // For description of the following members, see the corresponding accessor
176   // above.
177   //
178   // N.B. If another field is added, update ToMap() above.
179   std::string device_vendor_;
180   std::string platform_version_;
181   std::string driver_version_;
182   std::string runtime_version_;
183   std::string pci_bus_id_;
184   std::string name_;
185 
186   ThreadDim thread_dim_limit_;
187   BlockDim block_dim_limit_;
188 
189   int64 threads_per_core_limit_;
190   int64 threads_per_block_limit_;
191   int64 threads_per_warp_;
192 
193   int64 registers_per_core_limit_;
194   int64 registers_per_block_limit_;
195 
196   int64 device_address_bits_;
197   int64 device_memory_size_;
198   int64 memory_bandwidth_;
199 
200   // Shared memory limits on a given device.
201   int64 shared_memory_per_core_;
202   int64 shared_memory_per_block_;
203 
204   float clock_rate_ghz_;
205 
206   // CUDA "CC" major value, -1 if not available.
207   int cuda_compute_capability_major_;
208   int cuda_compute_capability_minor_;
209 
210   // ROCM AMDGPU ISA version, 0 if not available.
211   int rocm_amdgpu_isa_version_;
212 
213   // ROCm AMDGPU GCN Architecture name, "" if not available.
214   std::string rocm_amdgpu_gcn_arch_name_;
215 
216   int numa_node_;
217   int core_count_;
218   bool ecc_enabled_;
219 
220   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescription);
221 };
222 
223 namespace internal {
224 
// Helper class that builds a device description, given that it has a large
// number of fields that would be easily confused in constructor form.
227 class DeviceDescriptionBuilder {
228  public:
229   DeviceDescriptionBuilder();
230 
231   // For descriptions of the following fields, see comments on the corresponding
232   // DeviceDescription::* accessors above.
233 
set_device_vendor(const std::string & value)234   void set_device_vendor(const std::string &value) {
235     device_description_->device_vendor_ = value;
236   }
set_platform_version(const std::string & value)237   void set_platform_version(const std::string &value) {
238     device_description_->platform_version_ = value;
239   }
set_driver_version(const std::string & value)240   void set_driver_version(const std::string &value) {
241     device_description_->driver_version_ = value;
242   }
set_runtime_version(const std::string & value)243   void set_runtime_version(const std::string &value) {
244     device_description_->runtime_version_ = value;
245   }
set_pci_bus_id(const std::string & value)246   void set_pci_bus_id(const std::string &value) {
247     device_description_->pci_bus_id_ = value;
248   }
set_name(const std::string & value)249   void set_name(const std::string &value) {
250     device_description_->name_ = value;
251   }
252 
set_thread_dim_limit(const ThreadDim & value)253   void set_thread_dim_limit(const ThreadDim &value) {
254     device_description_->thread_dim_limit_ = value;
255   }
set_block_dim_limit(const BlockDim & value)256   void set_block_dim_limit(const BlockDim &value) {
257     device_description_->block_dim_limit_ = value;
258   }
259 
set_threads_per_core_limit(int64 value)260   void set_threads_per_core_limit(int64 value) {
261     device_description_->threads_per_core_limit_ = value;
262   }
set_threads_per_block_limit(int64 value)263   void set_threads_per_block_limit(int64 value) {
264     device_description_->threads_per_block_limit_ = value;
265   }
set_threads_per_warp(int64 value)266   void set_threads_per_warp(int64 value) {
267     device_description_->threads_per_warp_ = value;
268   }
269 
set_registers_per_core_limit(int64 value)270   void set_registers_per_core_limit(int64 value) {
271     device_description_->registers_per_core_limit_ = value;
272   }
set_registers_per_block_limit(int64 value)273   void set_registers_per_block_limit(int64 value) {
274     device_description_->registers_per_block_limit_ = value;
275   }
276 
set_device_address_bits(int64 value)277   void set_device_address_bits(int64 value) {
278     device_description_->device_address_bits_ = value;
279   }
set_device_memory_size(int64 value)280   void set_device_memory_size(int64 value) {
281     device_description_->device_memory_size_ = value;
282   }
set_memory_bandwidth(int64 value)283   void set_memory_bandwidth(int64 value) {
284     device_description_->memory_bandwidth_ = value;
285   }
286 
set_shared_memory_per_core(int64 value)287   void set_shared_memory_per_core(int64 value) {
288     device_description_->shared_memory_per_core_ = value;
289   }
set_shared_memory_per_block(int64 value)290   void set_shared_memory_per_block(int64 value) {
291     device_description_->shared_memory_per_block_ = value;
292   }
293 
set_clock_rate_ghz(float value)294   void set_clock_rate_ghz(float value) {
295     device_description_->clock_rate_ghz_ = value;
296   }
297 
set_cuda_compute_capability(int major,int minor)298   void set_cuda_compute_capability(int major, int minor) {
299     device_description_->cuda_compute_capability_major_ = major;
300     device_description_->cuda_compute_capability_minor_ = minor;
301   }
302 
set_rocm_amdgpu_isa_version(int version)303   void set_rocm_amdgpu_isa_version(int version) {
304     device_description_->rocm_amdgpu_isa_version_ = version;
305   }
306 
set_rocm_amdgpu_gcn_arch_name(const std::string & gcn_arch_name)307   void set_rocm_amdgpu_gcn_arch_name(const std::string &gcn_arch_name) {
308     device_description_->rocm_amdgpu_gcn_arch_name_ = gcn_arch_name;
309   }
310 
set_numa_node(int value)311   void set_numa_node(int value) { device_description_->numa_node_ = value; }
set_core_count(int value)312   void set_core_count(int value) { device_description_->core_count_ = value; }
set_ecc_enabled(bool value)313   void set_ecc_enabled(bool value) {
314     device_description_->ecc_enabled_ = value;
315   }
316 
317   // Returns a built DeviceDescription with ownership transferred to the
318   // caller. There are currently no restrictions on which fields must be set in
319   // order to build the descriptor.
320   //
321   // Once the description is built, this builder object should be discarded.
Build()322   std::unique_ptr<DeviceDescription> Build() {
323     return std::move(device_description_);
324   }
325 
326  private:
327   std::unique_ptr<DeviceDescription> device_description_;
328 
329   SE_DISALLOW_COPY_AND_ASSIGN(DeviceDescriptionBuilder);
330 };
331 
332 }  // namespace internal
333 
334 // Returns whether the given thread_dim is acceptable given the limits described
335 // in device_description. For detailed reasons for failing the predicate, enable
336 // VLOG(2) for this module.
337 bool ThreadDimOk(const DeviceDescription &device_description,
338                  const ThreadDim &thread_dim);
339 
// Equivalent to ceil(double(x) / y); e.g. x = element_count and
// y = threads_per_block.
341 ABSL_DEPRECATED("Use MathUtil::CeilOfRatio directly instead.")
342 int64 DivideCeil(int64 x, int64 y);
343 
344 // Calculate the number of threads/blocks required to process element_count
345 // elements. Note that you can still end up with more threads than
346 // element_count due to rounding, so kernels often start with an "is this
347 // thread id in the element_count range?" test.
348 void CalculateDimensionality(const DeviceDescription &device_description,
349                              int64 element_count, int64 *threads_per_block,
350                              int64 *block_count);
351 
352 }  // namespace stream_executor
353 
354 #endif  // TENSORFLOW_STREAM_EXECUTOR_DEVICE_DESCRIPTION_H_
355