; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s

; Note: The SI-FMA conversions of the form x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they require no-infs-fp-math
; for correctness.

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}

; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]

; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]

; SI: buffer_store_dwordx2 [[RESULT]]
define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}

;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}

;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VY:v[0-9]]], [[VT1]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VX:v[0-9]]], [[VT]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2,
                             float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2,
                             double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}

; Make sure the negative constant cancels out the fneg.
; GCN-LABEL: {{^}}fma_neg_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], 2.0, [[B]]
define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float -2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

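; Make sure the fneg is instead folded into the constant:
; fma(2.0, (fneg a), b) --> fma(a, -2.0, b)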
; GCN-LABEL: {{^}}fma_2.0_neg_a_b_f32:
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
; GCN-NOT: [[A]]
; GCN-NOT: [[B]]
; GCN: v_fma_f32 v{{[0-9]+}}, [[A]], -2.0, [[B]]
define amdgpu_kernel void @fma_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %r1 = load volatile float, float addrspace(1)* %gep.0
  %r2 = load volatile float, float addrspace(1)* %gep.1

  %r1.fneg = fsub float -0.000000e+00, %r1

  %r3 = tail call float @llvm.fma.f32(float 2.0, float %r1.fneg, float %r2)
  store float %r3, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }