/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

#include "absl/strings/match.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace tensorflow {
namespace profiler {
namespace {

27 // Get theoretical upperbound of single precision FMA throughput of the GPU per
28 // cycle per streaming multiprocessor.
29 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions
GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities & device_cap)30 uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) {
31   uint32 n_fp32_cores = 0;
32   uint32 n_tc_cores = 0;
33   switch (device_cap.compute_capability().major()) {
34     case 2:
35       // Fermi
36       n_fp32_cores = 32;
37       break;
38     case 3:
39       // Kepler
40       n_fp32_cores = 192;
41       break;
42     case 5:
43       // Maxwell
44       n_fp32_cores = 128;
45       break;
46     case 6:
47       // Pascal
48       if (device_cap.compute_capability().minor() > 0) {
49         // Pascal SM61/62
50         n_fp32_cores = 128;
51       } else {
52         // Pascal SM60
53         n_fp32_cores = 64;
54       }
55       break;
56     case 7:
57       // Volta and Turing
58       n_fp32_cores = 64;
59       n_tc_cores = 8;
60       break;
61     case 8:
62       // Ampere
63       if (device_cap.compute_capability().minor() >= 6) {
64         // Ampere SM86
65         n_fp32_cores = 128;
66       } else {
67         // Ampere SM80
68         n_fp32_cores = 64;
69       }
70       n_tc_cores = 4;
71       break;
72     default:
73       LOG(ERROR) << "Invalid GPU compute capability.";
74       break;
75   }
76   // GPU TensorCore can execute 64 FMAs per cycle.
77   // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
78   return n_fp32_cores + n_tc_cores * 64;
79 }

}  // namespace

GetFlopMaxThroughputPerSM(const DeviceCapabilities & device_cap)83 double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) {
84   // One FMA = 2 floating point operations, one multiply and one add.
85   return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 *
86          device_cap.clock_rate_in_ghz();
87 }
GpuModelName(const DeviceCapabilities & device_cap)89 absl::string_view GpuModelName(const DeviceCapabilities& device_cap) {
90   switch (device_cap.compute_capability().major()) {
91     case 2:
92       return "Nvidia GPU (Fermi)";
93     case 3:
94       return "Nvidia GPU (Kepler)";
95     case 5:
96       return "Nvidia GPU (Maxwell)";
97     case 6:
98       return "Nvidia GPU (Pascal)";
99     case 7:
100       if (device_cap.compute_capability().minor() < 5) {
101         return "Nvidia GPU (Volta)";
102       } else {
103         return "Nvidia GPU (Turing)";
104       }
105     case 8:
106       return "Nvidia GPU (Ampere)";
107     default:
108       return "Nvidia GPU";
109   }
110 }
ParseHardwareType(absl::string_view device_type)112 HardwareType ParseHardwareType(absl::string_view device_type) {
113   if (absl::StrContains(device_type, "GPU")) return HardwareType::GPU;
114   if (device_type == "CPU") return HardwareType::CPU_ONLY;
115   if (device_type == "TPU") return HardwareType::TPU;
116   return HardwareType::UNKNOWN_HARDWARE;
117 }
HasDevice(HardwareType x)119 bool HasDevice(HardwareType x) { return x > tensorflow::profiler::CPU_ONLY; }

}  // namespace profiler
}  // namespace tensorflow