// Tests that ptxas and fatbinary are invoked correctly during CUDA
// compilation.
//
// Every driver invocation below passes -###, so the resulting command lines
// are printed for FileCheck to match rather than executed.
//
// REQUIRES: clang-driver
// REQUIRES: x86-registered-target
// REQUIRES: nvptx-registered-target

// Regular compiles with -O{0,1,2,3,4,fast}. -O4 and -Ofast map to ptxas -O3.
// RUN: %clang -### -target x86_64-linux-gnu -O0 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
// RUN: %clang -### -target x86_64-linux-gnu -O1 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT1 %s
// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT2 %s
// RUN: %clang -### -target x86_64-linux-gnu -O3 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// RUN: %clang -### -target x86_64-linux-gnu -O4 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// Generating relocatable device code.
// RUN: %clang -### -target x86_64-linux-gnu -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s

// With debugging enabled, ptxas should be run with no ptxas optimizations.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -g -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,DBG %s

// --no-cuda-noopt-device-debug overrides --cuda-noopt-device-debug.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug \
// RUN: --no-cuda-noopt-device-debug -O2 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT2 %s

// Regular compile without -O. This should result in us passing -O0 to ptxas.
// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s

// Regular compiles with -Os and -Oz. For lack of a better option, we map
// these to ptxas -O2.
// RUN: %clang -### -target x86_64-linux-gnu -Os -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT2 %s
// RUN: %clang -### -target x86_64-linux-gnu -Oz -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT2 %s

// Regular compile targeting sm_35.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
// Separate compilation targeting sm_35.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s

// 32-bit compile.
// RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
// 32-bit compile when generating relocatable device code.
// RUN: %clang -### -target i386-linux-gnu -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s

// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
// Check that we still pass -c when generating relocatable device code.
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s

// Check -Xcuda-ptxas and -Xcuda-fatbinary: the -foo* arguments are expected
// on the ptxas command line and the -bar* arguments on the fatbinary one.
// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
// RUN: -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,SM20,PTXAS-EXTRA,FATBINARY-EXTRA %s

// MacOS spot-checks
// RUN: %clang -### -target x86_64-apple-macosx -O0 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
// RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s

// Check relocatable device code generation on MacOS.
// RUN: %clang -### -target x86_64-apple-macosx -O0 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
// RUN: %clang -### -target i386-apple-macosx -fgpu-rdc -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s

// Check that clang forwards the -v flag to ptxas.
// RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s

// Match clang job that produces PTX assembly.
// CHECK: "-cc1"
// ARCH64-SAME: "-triple" "nvptx64-nvidia-cuda"
// ARCH32-SAME: "-triple" "nvptx-nvidia-cuda"
// RDC-SAME: "-fgpu-rdc"
// CHECK-NOT: "-fgpu-rdc"
// SM20-SAME: "-target-cpu" "sm_20"
// SM35-SAME: "-target-cpu" "sm_35"
// SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
// SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"

// Match the call to ptxas (which assembles PTX to SASS).
// CHECK: ptxas
// ARCH64-SAME: "-m64"
// ARCH32-SAME: "-m32"
// OPT0-SAME: "-O0"
// OPT0-NOT: "-g"
// OPT1-SAME: "-O1"
// OPT1-NOT: "-g"
// OPT2-SAME: "-O2"
// OPT2-NOT: "-g"
// OPT3-SAME: "-O3"
// OPT3-NOT: "-g"
// DBG-SAME: "-g" "--dont-merge-basicblocks" "--return-at-end"
// SM20-SAME: "--gpu-name" "sm_20"
// SM35-SAME: "--gpu-name" "sm_35"
// SM20-SAME: "--output-file" "[[CUBINFILE:[^"]*]]"
// SM35-SAME: "--output-file" "[[CUBINFILE:[^"]*]]"
// CHECK-SAME: "[[PTXFILE]]"
// PTXAS-EXTRA-SAME: "-foo1"
// PTXAS-EXTRA-SAME: "-foo2"
// RDC-SAME: "-c"
// CHECK-NOT: "-c"

// Match the call to fatbinary (which combines all our PTX and SASS into one
// blob).
// NOTE(review): several directives in this section use a combined
// "-SAME-DAG" suffix. FileCheck documents -SAME and -DAG only as separate
// suffixes, so these lines may be silently skipped instead of matched --
// TODO confirm against the FileCheck version in use before relying on them.
// CHECK: fatbinary
// CHECK-SAME-DAG: "--cuda"
// ARCH64-SAME-DAG: "-64"
// ARCH32-SAME-DAG: "-32"
// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
// SM20-SAME-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
// SM35-SAME-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
// SM20-SAME-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
// SM35-SAME-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
// FATBINARY-EXTRA-SAME: "-bar1"
// FATBINARY-EXTRA-SAME: "-bar2"

// Match the clang job for host compilation.
// CHECK: "-cc1"
// ARCH64-SAME: "-triple" "x86_64-
// ARCH32-SAME: "-triple" "i386-
// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
// RDC-SAME: "-fgpu-rdc"
// CHECK-NOT: "-fgpu-rdc"

// Matches the verbose run: ptxas must receive "-v".
// CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
