1; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
2; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
3
4; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
5; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
6
7; FIXME: This should also fold when fma is actually fast if an FMA
8; exists in the original program.
9
10; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
11
12; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
13; GCN: buffer_load_dword [[X:v[0-9]+]]
14; GCN: buffer_load_dword [[Y:v[0-9]+]]
15; GCN: buffer_load_dword [[Z:v[0-9]+]]
16; GCN: buffer_load_dword [[U:v[0-9]+]]
17; GCN: buffer_load_dword [[V:v[0-9]+]]
18
19; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
20; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
21; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
22
23; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
24; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
25; GCN-FASTFMA: buffer_store_dword [[FMA1]]
26
27; GCN-SLOWFMA: v_mul_f32_e32
28; GCN-SLOWFMA: v_mul_f32_e32
29; GCN-SLOWFMA: v_add_f32_e32
30; GCN-SLOWFMA: v_add_f32_e32
; Base fadd case: computes fmuladd(x, y, u * v) + z with every floating-point
; operation marked `fast`. Both the fmul and the fmuladd have exactly one use.
31define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
32  %x = load volatile float, float addrspace(1)* undef
33  %y = load volatile float, float addrspace(1)* undef
34  %z = load volatile float, float addrspace(1)* undef
35  %u = load volatile float, float addrspace(1)* undef
36  %v = load volatile float, float addrspace(1)* undef
  ; u * v is only consumed as the addend operand of the fmuladd below.
37  %mul.u.v = fmul fast float %u, %v
38  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; Outer fadd whose folding into the multiply-add chain is what the check
  ; lines above test.
39  %add = fadd fast float %fma, %z
40  store volatile float %add, float addrspace(1)* undef
41  ret void
42}
43
44; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
45; GCN: buffer_load_dword [[X:v[0-9]+]]
46; GCN: buffer_load_dword [[Y:v[0-9]+]]
47; GCN: buffer_load_dword [[Z:v[0-9]+]]
48; GCN: buffer_load_dword [[U:v[0-9]+]]
49; GCN: buffer_load_dword [[V:v[0-9]+]]
50
51; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]+]], [[U]], [[V]], -[[Z]]
52; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
53; GCN-FLUSH-NEXT: buffer_store_dword [[TMP]]
54
55; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
56; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
57; GCN-FASTFMA: buffer_store_dword [[FMA1]]
; Base fsub case: computes fmuladd(x, y, u * v) - z with every floating-point
; operation marked `fast`. Both the fmul and the fmuladd have exactly one use.
58define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
59  %x = load volatile float, float addrspace(1)* undef
60  %y = load volatile float, float addrspace(1)* undef
61  %z = load volatile float, float addrspace(1)* undef
62  %u = load volatile float, float addrspace(1)* undef
63  %v = load volatile float, float addrspace(1)* undef
  ; u * v is only consumed as the addend operand of the fmuladd below.
64  %mul.u.v = fmul fast float %u, %v
65  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; NOTE: despite its name, %add holds the result of an fsub (fma - z).
66  %add = fsub fast float %fma, %z
67  store volatile float %add, float addrspace(1)* undef
68  ret void
69}
70
71; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
72; GCN: buffer_load_dword [[X:v[0-9]+]]
73; GCN: buffer_load_dword [[Y:v[0-9]+]]
74; GCN: buffer_load_dword [[Z:v[0-9]+]]
75; GCN: buffer_load_dword [[U:v[0-9]+]]
76; GCN: buffer_load_dword [[V:v[0-9]+]]
77
78; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
79; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
80; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[MUL]], [[Z]]
81
82; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
83; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
84; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
85
86; GCN-SLOWFMA: v_mul_f32_e32
87; GCN-SLOWFMA: v_mul_f32_e32
88; GCN-SLOWFMA: v_add_f32_e32
89; GCN-SLOWFMA: v_add_f32_e32
; fadd case where the inner fmul has a second use: the product u * v is stored
; to memory in addition to feeding the fmuladd.
90define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
91  %x = load volatile float, float addrspace(1)* undef
92  %y = load volatile float, float addrspace(1)* undef
93  %z = load volatile float, float addrspace(1)* undef
94  %u = load volatile float, float addrspace(1)* undef
95  %v = load volatile float, float addrspace(1)* undef
96  %mul.u.v = fmul fast float %u, %v
  ; Extra use of the product: it is written out before the fmuladd consumes it.
97  store volatile float %mul.u.v, float addrspace(1)* undef
98  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; The fmuladd result is the left operand of the fadd in this variant.
99  %add = fadd fast float %fma, %z
100  store volatile float %add, float addrspace(1)* undef
101  ret void
102}
103
104; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
105; GCN: buffer_load_dword [[X:v[0-9]+]]
106; GCN: buffer_load_dword [[Y:v[0-9]+]]
107; GCN: buffer_load_dword [[Z:v[0-9]+]]
108; GCN: buffer_load_dword [[U:v[0-9]+]]
109; GCN: buffer_load_dword [[V:v[0-9]+]]
110
111; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
112; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
113; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[MUL]]
114
115; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
116; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
117; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
118
119; GCN-SLOWFMA: v_mul_f32_e32
120; GCN-SLOWFMA: v_mul_f32_e32
121; GCN-SLOWFMA: v_add_f32_e32
122; GCN-SLOWFMA: v_add_f32_e32
; Commuted form of the multi-use-mul fadd case: the product u * v is stored as
; well as fed to the fmuladd, and the fadd takes z as its left operand.
123define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
124  %x = load volatile float, float addrspace(1)* undef
125  %y = load volatile float, float addrspace(1)* undef
126  %z = load volatile float, float addrspace(1)* undef
127  %u = load volatile float, float addrspace(1)* undef
128  %v = load volatile float, float addrspace(1)* undef
129  %mul.u.v = fmul fast float %u, %v
  ; Extra use of the product: it is written out before the fmuladd consumes it.
130  store volatile float %mul.u.v, float addrspace(1)* undef
131  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; Operands swapped relative to the non-commuted variant: z + fma.
132  %add = fadd fast float %z, %fma
133  store volatile float %add, float addrspace(1)* undef
134  ret void
135}
136
137; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
138; GCN: buffer_load_dword [[X:v[0-9]+]]
139; GCN: buffer_load_dword [[Y:v[0-9]+]]
140; GCN: buffer_load_dword [[Z:v[0-9]+]]
141; GCN: buffer_load_dword [[U:v[0-9]+]]
142; GCN: buffer_load_dword [[V:v[0-9]+]]
143
144; GCN-SLOWFMA: v_mul_f32_e32
145; GCN-SLOWFMA: v_mul_f32_e32
146; GCN-SLOWFMA: v_add_f32_e32
147; GCN-SLOWFMA: v_add_f32_e32
; fadd case where the fmuladd result has a second use: it is stored to memory
; in addition to feeding the fadd. The inner fmul still has a single use.
148define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
149  %x = load volatile float, float addrspace(1)* undef
150  %y = load volatile float, float addrspace(1)* undef
151  %z = load volatile float, float addrspace(1)* undef
152  %u = load volatile float, float addrspace(1)* undef
153  %v = load volatile float, float addrspace(1)* undef
154  %mul.u.v = fmul fast float %u, %v
155  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; Extra use of the fmuladd result: it is written out before the fadd.
156  store volatile float %fma, float addrspace(1)* undef
  ; The fmuladd result is the left operand of the fadd in this variant.
157  %add = fadd fast float %fma, %z
158  store volatile float %add, float addrspace(1)* undef
159  ret void
160}
161
162; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
163; GCN: buffer_load_dword [[X:v[0-9]+]]
164; GCN: buffer_load_dword [[Y:v[0-9]+]]
165; GCN: buffer_load_dword [[Z:v[0-9]+]]
166; GCN: buffer_load_dword [[U:v[0-9]+]]
167; GCN: buffer_load_dword [[V:v[0-9]+]]
168
169; GCN-SLOWFMA: v_mul_f32_e32
170; GCN-SLOWFMA: v_mul_f32_e32
171; GCN-SLOWFMA: v_add_f32_e32
172; GCN-SLOWFMA: v_add_f32_e32
; Commuted form of the multi-use-fmuladd fadd case: the fmuladd result is
; stored as well as fed to the fadd, and the fadd takes z as its left operand.
173define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
174  %x = load volatile float, float addrspace(1)* undef
175  %y = load volatile float, float addrspace(1)* undef
176  %z = load volatile float, float addrspace(1)* undef
177  %u = load volatile float, float addrspace(1)* undef
178  %v = load volatile float, float addrspace(1)* undef
179  %mul.u.v = fmul fast float %u, %v
180  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; Extra use of the fmuladd result: it is written out before the fadd.
181  store volatile float %fma, float addrspace(1)* undef
  ; Operands swapped relative to the non-commuted variant: z + fma.
182  %add = fadd fast float %z, %fma
183  store volatile float %add, float addrspace(1)* undef
184  ret void
185}
186
187; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
188; GCN: buffer_load_dword [[X:v[0-9]+]]
189; GCN: buffer_load_dword [[Y:v[0-9]+]]
190; GCN: buffer_load_dword [[Z:v[0-9]+]]
191; GCN: buffer_load_dword [[U:v[0-9]+]]
192; GCN: buffer_load_dword [[V:v[0-9]+]]
193
194; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
195
196; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
197; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
198
199; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
200; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
201
202; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
203; GCN-SLOWFMA: v_add_f32_e32
204; GCN-SLOWFMA: v_sub_f32_e32 [[MAD:v[0-9]+]]
205
206; GCN: buffer_store_dword [[MUL]]
207; GCN: buffer_store_dword [[MAD]]
; fsub case where the inner fmul has a second use: the product u * v is stored
; to memory in addition to feeding the fmuladd.
208define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
209  %x = load volatile float, float addrspace(1)* undef
210  %y = load volatile float, float addrspace(1)* undef
211  %z = load volatile float, float addrspace(1)* undef
212  %u = load volatile float, float addrspace(1)* undef
213  %v = load volatile float, float addrspace(1)* undef
214  %mul.u.v = fmul fast float %u, %v
215  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; NOTE: despite its name, %add holds the result of an fsub (fma - z).
216  %add = fsub fast float %fma, %z
  ; Both stores come after the arithmetic: first the product, then the
  ; subtraction result.
217  store volatile float %mul.u.v, float addrspace(1)* undef
218  store volatile float %add, float addrspace(1)* undef
219  ret void
220}
221
222; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd:
223; GCN: buffer_load_dword [[X:v[0-9]+]]
224; GCN: buffer_load_dword [[Y:v[0-9]+]]
225; GCN: buffer_load_dword [[Z:v[0-9]+]]
226; GCN: buffer_load_dword [[U:v[0-9]+]]
227; GCN: buffer_load_dword [[V:v[0-9]+]]
228
229; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
230
231; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
232; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]],  [[MUL]], [[Z]]
233; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
234; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
235
236; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[MUL]]
237; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
238; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
239; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
240
241; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
242; GCN-SLOWFMA: v_add_f32_e32
243; GCN-SLOWFMA: v_sub_f32_e32
; fsub case where the fmuladd result has a second use: it is stored to memory
; in addition to feeding the fsub. The inner fmul still has a single use.
244define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
  ; Five separate volatile loads through an undef addrspace(1) pointer; the
  ; check lines above bind them, in this order, to X, Y, Z, U and V.
245  %x = load volatile float, float addrspace(1)* undef
246  %y = load volatile float, float addrspace(1)* undef
247  %z = load volatile float, float addrspace(1)* undef
248  %u = load volatile float, float addrspace(1)* undef
249  %v = load volatile float, float addrspace(1)* undef
250  %mul.u.v = fmul fast float %u, %v
251  %fma = call fast float @llvm.fmuladd.f32(float %x, float %y, float %mul.u.v)
  ; NOTE: despite its name, %add holds the result of an fsub (fma - z).
252  %add = fsub fast float %fma, %z
  ; Both stores come after the arithmetic: first the fmuladd result, then the
  ; subtraction result.
253  store volatile float %fma, float addrspace(1)* undef
254  store volatile float %add, float addrspace(1)* undef
255  ret void
256}
257
; Intrinsic declarations. Only llvm.fmuladd.f32 is called by the functions
; visible in this file; llvm.fma.f32 is declared but has no visible caller.
258declare float @llvm.fma.f32(float, float, float) #1
259declare float @llvm.fmuladd.f32(float, float, float) #1
260
; Attribute group #0 is attached to every kernel definition in this file,
; #1 to the two intrinsic declarations.
261attributes #0 = { nounwind }
262attributes #1 = { nounwind readnone }
263