; Make sure we still form mad, rather than fma, even when unsafe math or fp-contract is allowed.

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s

; Make sure we don't form mad when fp32 denormals are enabled.
; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
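
; Prefix summary (derived from the RUN lines and the checks below) - SI-STD covers
; the plain, fp-contract=fast and unsafe-fp-math runs, where v_mad_f32/v_mac_f32 is
; expected; SI-DENORM covers +fp32-denormals on tahiti, where mad cannot be used and
; v_fma_f32 is expected instead; SI-DENORM-SLOWFMAF covers verde, where fma is also
; considered slow (hence the prefix name), so a separate v_mul_f32/v_add_f32 pair is
; expected.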

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT1:v[0-9]+]], -[[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fmuladd x, y, (fmul u, v)), z) -> (fmuladd x, y, (fmuladd u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]

; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[A]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }