/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/utils/hardware_type_utils.h"

#include "absl/strings/match.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"

namespace tensorflow {
namespace profiler {
namespace {

// Get theoretical upperbound of single precision FMA throughput of the GPU per
// cycle per streaming multiprocessor.
29 // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions__throughput-native-arithmetic-instructions GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities & device_cap)30uint32 GetFmaMaxThroughputPerSMPerCycle(const DeviceCapabilities& device_cap) { 31 uint32 n_fp32_cores = 0; 32 uint32 n_tc_cores = 0; 33 switch (device_cap.compute_capability().major()) { 34 case 2: 35 // Fermi 36 n_fp32_cores = 32; 37 break; 38 case 3: 39 // Kepler 40 n_fp32_cores = 192; 41 break; 42 case 5: 43 // Maxwell 44 n_fp32_cores = 128; 45 break; 46 case 6: 47 // Pascal 48 if (device_cap.compute_capability().minor() > 0) { 49 // Pascal SM61/62 50 n_fp32_cores = 128; 51 } else { 52 // Pascal SM60 53 n_fp32_cores = 64; 54 } 55 break; 56 case 7: 57 // Volta and Turing 58 n_fp32_cores = 64; 59 n_tc_cores = 8; 60 break; 61 case 8: 62 // Ampere 63 if (device_cap.compute_capability().minor() >= 6) { 64 // Ampere SM86 65 n_fp32_cores = 128; 66 } else { 67 // Ampere SM80 68 n_fp32_cores = 64; 69 } 70 n_tc_cores = 4; 71 break; 72 default: 73 LOG(ERROR) << "Invalid GPU compute capability."; 74 break; 75 } 76 // GPU TensorCore can execute 64 FMAs per cycle. 77 // https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/ 78 return n_fp32_cores + n_tc_cores * 64; 79 } 80 81 } // namespace 82 GetFlopMaxThroughputPerSM(const DeviceCapabilities & device_cap)83double GetFlopMaxThroughputPerSM(const DeviceCapabilities& device_cap) { 84 // One FMA = 2 floating point operations, one multiply and one add. 
85 return GetFmaMaxThroughputPerSMPerCycle(device_cap) * 2 * 86 device_cap.clock_rate_in_ghz(); 87 } 88 GpuModelName(const DeviceCapabilities & device_cap)89absl::string_view GpuModelName(const DeviceCapabilities& device_cap) { 90 switch (device_cap.compute_capability().major()) { 91 case 2: 92 return "Nvidia GPU (Fermi)"; 93 case 3: 94 return "Nvidia GPU (Kepler)"; 95 case 5: 96 return "Nvidia GPU (Maxwell)"; 97 case 6: 98 return "Nvidia GPU (Pascal)"; 99 case 7: 100 if (device_cap.compute_capability().minor() < 5) { 101 return "Nvidia GPU (Volta)"; 102 } else { 103 return "Nvidia GPU (Turing)"; 104 } 105 case 8: 106 return "Nvidia GPU (Ampere)"; 107 default: 108 return "Nvidia GPU"; 109 } 110 } 111 ParseHardwareType(absl::string_view device_type)112HardwareType ParseHardwareType(absl::string_view device_type) { 113 if (absl::StrContains(device_type, "GPU")) return HardwareType::GPU; 114 if (device_type == "CPU") return HardwareType::CPU_ONLY; 115 if (device_type == "TPU") return HardwareType::TPU; 116 return HardwareType::UNKNOWN_HARDWARE; 117 } 118 HasDevice(HardwareType x)119bool HasDevice(HardwareType x) { return x > tensorflow::profiler::CPU_ONLY; } 120 121 } // namespace profiler 122 } // namespace tensorflow 123