1 // REQUIRES: x86-registered-target, nvptx-registered-target, amdgpu-registered-target
2 
3 // By default CUDA uses -ffp-contract=fast, HIP uses -ffp-contract=fast-honor-pragmas.
4 // we should fuse multiply/add into fma instruction.
5 // In IR, fmul/fadd instructions with contract flag are emitted.
6 // In backend
7 //    nvptx -  assumes fast fp fuse option, which fuses
8 //             mult/add insts disregarding contract flag and
9 //             llvm.fmuladd intrinsics.
10 //    amdgcn - assumes standard fp fuse option, which only
11 //             fuses mult/add insts with contract flag and
12 //             llvm.fmuladd intrinsics.
13 
14 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
15 // RUN:   -disable-llvm-passes -o - %s \
16 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
17 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
18 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
19 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
20 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
21 // RUN:   -O3 -o - %s \
22 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
23 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
24 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
25 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
26 
27 // Check separate compile/backend steps corresponding to -save-temps.
28 
29 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
30 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
31 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
32 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
33 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
34 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
35 
36 // Explicit -ffp-contract=fast
37 // In IR, fmul/fadd instructions with contract flag are emitted.
38 // In backend
39 //    nvptx/amdgcn - assumes fast fp fuse option, which fuses
40 //                   mult/add insts disregarding contract flag and
41 //                   llvm.fmuladd intrinsics.
42 
43 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
44 // RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
45 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
46 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
47 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
48 // RUN:   -ffp-contract=fast \
49 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
50 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
51 // RUN:   -O3 -o - %s \
52 // RUN:   -ffp-contract=fast \
53 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FAST %s
54 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
55 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
56 // RUN:   -ffp-contract=fast \
57 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST %s
58 
59 // Check separate compile/backend steps corresponding to -save-temps.
60 // When input is IR, -ffp-contract has no effect. Backend uses default
61 // default FP fuse option.
62 
63 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
64 // RUN:   -ffp-contract=fast \
65 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
66 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
67 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
68 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
69 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
70 
71 // Explicit -ffp-contract=fast-honor-pragmas
72 // In IR, fmul/fadd instructions with contract flag are emitted.
73 // In backend
74 //    nvptx/amdgcn - assumes standard fp fuse option, which only
75 //                   fuses mult/add insts with contract flag or
76 //                   llvm.fmuladd intrinsics.
77 
78 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
79 // RUN:   -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \
80 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
81 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
82 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
83 // RUN:   -ffp-contract=fast-honor-pragmas \
84 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
85 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
86 // RUN:   -O3 -o - %s \
87 // RUN:   -ffp-contract=fast-honor-pragmas \
88 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-FASTSTD %s
89 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
90 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
91 // RUN:   -ffp-contract=fast-honor-pragmas \
92 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
93 
94 // Check separate compile/backend steps corresponding to -save-temps.
95 // When input is IR, -ffp-contract has no effect. Backend uses default
96 // default FP fuse option.
97 
98 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
99 // RUN:   -ffp-contract=fast-honor-pragmas \
100 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
101 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-FAST-IR %s
102 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
103 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
104 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-FASTSTD %s
105 
106 // Explicit -ffp-contract=on -- fusing by front-end.
107 // In IR,
108 //    mult/add in the same statement - llvm.fmuladd instrinsic emitted
109 //    mult/add in different statement -  fmul/fadd instructions without
110 //                                       contract flag are emitted.
111 // In backend
112 //    nvptx/amdgcn - assumes standard fp fuse option, which only
113 //                   fuses mult/add insts with contract flag or
114 //                   llvm.fmuladd intrinsics.
115 
116 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
117 // RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
118 // RUN:   | FileCheck -check-prefixes=COMMON,NV-ON %s
119 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
120 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
121 // RUN:   -ffp-contract=on \
122 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-ON %s
123 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
124 // RUN:   -O3 -o - %s \
125 // RUN:   -ffp-contract=on \
126 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-ON %s
127 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
128 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
129 // RUN:   -ffp-contract=on \
130 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
131 
132 // Check separate compile/backend steps corresponding to -save-temps.
133 
134 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
135 // RUN:   -ffp-contract=on \
136 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
137 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-ON-IR %s
138 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
139 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
140 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-ON %s
141 
142 // Explicit -ffp-contract=off should disable instruction fusing.
143 // In IR, fmul/fadd instructions without contract flag are emitted.
144 // In backend
145 //    nvptx/amdgcn - assumes standard fp fuse option, which only
146 //                   fuses mult/add insts with contract flag or
147 //                   llvm.fmuladd intrinsics.
148 
149 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
150 // RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
151 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OFF %s
152 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
153 // RUN:   -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
154 // RUN:   -ffp-contract=off \
155 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OFF %s
156 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
157 // RUN:   -O3 -o - %s \
158 // RUN:   -ffp-contract=off \
159 // RUN:   | FileCheck -check-prefixes=COMMON,NV-OPT-OFF %s
160 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
161 // RUN:   -O3 -target-cpu gfx906 -o - -x hip %s \
162 // RUN:   -ffp-contract=off \
163 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
164 
165 // Check separate compile/backend steps corresponding to -save-temps.
166 
167 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
168 // RUN:   -ffp-contract=off \
169 // RUN:   -O3 -disable-llvm-passes -target-cpu gfx906 -o %t.ll -x hip %s
170 // RUN: cat %t.ll  | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF-IR %s
171 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S \
172 // RUN:   -O3 -target-cpu gfx906 -o - -x ir %t.ll \
173 // RUN:   | FileCheck -check-prefixes=COMMON,AMD-OPT-OFF %s
174 
175 #include "Inputs/cuda.h"
176 
177 // Test multiply/add in the same statement, which can be emitted as FMA when
178 // fp-contract is on or fast.
func(float a,float b,float c)179 __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
180 // COMMON-LABEL: _Z4funcfff
181 // NV-ON:       fma.rn.f32
182 // NV-ON-NEXT:  st.param.f32
183 // AMD-ON:       v_fmac_f32_e64
184 // AMD-ON-NEXT:  s_setpc_b64
185 
186 // NV-OFF:      mul.rn.f32
187 // NV-OFF-NEXT: add.rn.f32
188 // NV-OFF-NEXT: st.param.f32
189 // AMD-OFF:      v_mul_f32_e64
190 // AMD-OFF-NEXT: v_add_f32_e64
191 // AMD-OFF-NEXT: s_setpc_b64
192 
193 // NV-OPT-FAST: fma.rn.f32
194 // NV-OPT-FAST-NEXT: st.param.f32
195 // NV-OPT-FASTSTD: fma.rn.f32
196 // NV-OPT-FASTSTD-NEXT: st.param.f32
197 // NV-OPT-ON: fma.rn.f32
198 // NV-OPT-ON-NEXT: st.param.f32
199 // NV-OPT-OFF: mul.rn.f32
200 // NV-OPT-OFF-NEXT: add.rn.f32
201 // NV-OPT-OFF-NEXT: st.param.f32
202 
203 // AMD-OPT-FAST-IR: fmul contract float
204 // AMD-OPT-FAST-IR: fadd contract float
205 // AMD-OPT-ON-IR: @llvm.fmuladd.f32
206 // AMD-OPT-OFF-IR: fmul float
207 // AMD-OPT-OFF-IR: fadd float
208 
209 // AMD-OPT-FAST: v_fmac_f32_e32
210 // AMD-OPT-FAST-NEXT: s_setpc_b64
211 // AMD-OPT-FASTSTD: v_fmac_f32_e32
212 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
213 // AMD-OPT-ON: v_fmac_f32_e32
214 // AMD-OPT-ON-NEXT: s_setpc_b64
215 // AMD-OPT-OFF: v_mul_f32_e32
216 // AMD-OPT-OFF-NEXT: v_add_f32_e32
217 // AMD-OPT-OFF-NEXT: s_setpc_b64
218 
219 // Test multiply/add in the different statements, which can be emitted as
220 // FMA when fp-contract is fast but not on.
func2(float a,float b,float c)221 __host__ __device__ float func2(float a, float b, float c) {
222   float t = b * c;
223   return t + a;
224 }
225 // COMMON-LABEL: _Z5func2fff
226 // NV-OPT-FAST: fma.rn.f32
227 // NV-OPT-FAST-NEXT: st.param.f32
228 // NV-OPT-FASTSTD: fma.rn.f32
229 // NV-OPT-FASTSTD-NEXT: st.param.f32
230 // NV-OPT-ON: mul.rn.f32
231 // NV-OPT-ON: add.rn.f32
232 // NV-OPT-ON-NEXT: st.param.f32
233 // NV-OPT-OFF: mul.rn.f32
234 // NV-OPT-OFF: add.rn.f32
235 // NV-OPT-OFF-NEXT: st.param.f32
236 
237 // AMD-OPT-FAST-IR: fmul contract float
238 // AMD-OPT-FAST-IR: fadd contract float
239 // AMD-OPT-ON-IR: fmul float
240 // AMD-OPT-ON-IR: fadd float
241 // AMD-OPT-OFF-IR: fmul float
242 // AMD-OPT-OFF-IR: fadd float
243 
244 // AMD-OPT-FAST: v_fmac_f32_e32
245 // AMD-OPT-FAST-NEXT: s_setpc_b64
246 // AMD-OPT-FASTSTD: v_fmac_f32_e32
247 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
248 // AMD-OPT-ON: v_mul_f32_e32
249 // AMD-OPT-ON-NEXT: v_add_f32_e32
250 // AMD-OPT-ON-NEXT: s_setpc_b64
251 // AMD-OPT-OFF: v_mul_f32_e32
252 // AMD-OPT-OFF-NEXT: v_add_f32_e32
253 // AMD-OPT-OFF-NEXT: s_setpc_b64
254 
255 // Test multiply/add in the different statements, which is forced
256 // to be compiled with fp contract on. fmul/fadd without contract
257 // flags are emitted in IR. In nvptx, they are emitted as FMA in
258 // fp-contract is fast but not on, as nvptx backend uses the same
259 // fp fuse option as front end, whereas fast fp fuse option in
260 // backend fuses fadd/fmul disregarding contract flag. In amdgcn
261 // they are not fused as amdgcn always use standard fp fusion
262 // option which respects contract flag.
func3(float a,float b,float c)263   __host__ __device__ float func3(float a, float b, float c) {
264 #pragma clang fp contract(on)
265   float t = b * c;
266   return t + a;
267 }
268 // COMMON-LABEL: _Z5func3fff
269 // NV-OPT-FAST: fma.rn.f32
270 // NV-OPT-FAST-NEXT: st.param.f32
271 // NV-OPT-FASTSTD: mul.rn.f32
272 // NV-OPT-FASTSTD: add.rn.f32
273 // NV-OPT-FASTSTD-NEXT: st.param.f32
274 // NV-OPT-ON: mul.rn.f32
275 // NV-OPT-ON: add.rn.f32
276 // NV-OPT-ON-NEXT: st.param.f32
277 // NV-OPT-OFF: mul.rn.f32
278 // NV-OPT-OFF: add.rn.f32
279 // NV-OPT-OFF-NEXT: st.param.f32
280 
281 // AMD-OPT-FAST-IR: fmul float
282 // AMD-OPT-FAST-IR: fadd float
283 // AMD-OPT-ON-IR: fmul float
284 // AMD-OPT-ON-IR: fadd float
285 // AMD-OPT-OFF-IR: fmul float
286 // AMD-OPT-OFF-IR: fadd float
287 
288 // AMD-OPT-FAST: v_fmac_f32_e32
289 // AMD-OPT-FAST-NEXT: s_setpc_b64
290 // AMD-OPT-FASTSTD: v_mul_f32_e32
291 // AMD-OPT-FASTSTD-NEXT: v_add_f32_e32
292 // AMD-OPT-FASTSTD-NEXT: s_setpc_b64
293 // AMD-OPT-ON: v_mul_f32_e32
294 // AMD-OPT-ON-NEXT: v_add_f32_e32
295 // AMD-OPT-ON-NEXT: s_setpc_b64
296 // AMD-OPT-OFF: v_mul_f32_e32
297 // AMD-OPT-OFF-NEXT: v_add_f32_e32
298 // AMD-OPT-OFF-NEXT: s_setpc_b64
299