1; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-FASTFMAF -check-prefix=SI -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-SLOWFMAF -check-prefix=SI -check-prefix=FUNC %s
3
; Intrinsic declarations for the tests below; all use attribute group #0.
; In the visible part of this file only @llvm.r600.read.tidig.x and
; @llvm.fma.f64 are actually called.
4declare i32 @llvm.r600.read.tidig.x() #0
5declare double @llvm.fabs.f64(double) #0
6declare double @llvm.fma.f64(double, double, double) #0
7declare float @llvm.fma.f32(float, float, float) #0
8
9; (fadd (fmul x, y), z) -> (fma x, y, z)
10; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
11; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
12; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
13; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
14; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
15; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads a, b, c from in[tid], in[tid+1], in[tid+2] and stores
; (a * b) + c to out[tid], where tid is the result of @llvm.r600.read.tidig.x.
16define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
17  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
18  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
19  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
20  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
21  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
22
  ; The three operands are consecutive doubles starting at in[tid].
23  %a = load double, double addrspace(1)* %gep.0
24  %b = load double, double addrspace(1)* %gep.1
25  %c = load double, double addrspace(1)* %gep.2
26
  ; Separate multiply and add; the checks above expect a single v_fma_f64.
27  %mul = fmul double %a, %b
28  %fma = fadd double %mul, %c
29  store double %fma, double addrspace(1)* %gep.out
30  ret void
31}
32
33; (fadd (fmul x, y), z) -> (fma x, y, z)
34; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
35; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
36; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
37; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
38; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
39; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
40; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
41; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
42; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
43; SI: s_endpgm
; Kernel: loads a, b, c, d from in[tid..tid+3] and stores (a * b) + c to
; out[tid] and (a * b) + d to out[tid+1]. The product has two users.
44define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
45  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
46  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
47  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
48  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
49  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
50  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
51  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
52
53  %a = load double, double addrspace(1)* %gep.0
54  %b = load double, double addrspace(1)* %gep.1
55  %c = load double, double addrspace(1)* %gep.2
56  %d = load double, double addrspace(1)* %gep.3
57
  ; One fmul feeds both fadds; the checks above expect two v_fma_f64, each
  ; taking A and B directly.
58  %mul = fmul double %a, %b
59  %fma0 = fadd double %mul, %c
60  %fma1 = fadd double %mul, %d
61  store double %fma0, double addrspace(1)* %gep.out.0
62  store double %fma1, double addrspace(1)* %gep.out.1
63  ret void
64}
65
66; (fadd x, (fmul y, z)) -> (fma y, z, x)
67; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
68; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
69; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
70; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
71; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
72; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads a, b, c from in[tid..tid+2] and stores c + (a * b) to
; out[tid]. Same as combine_to_fma_f64_0 but with the fadd operands swapped.
73define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
74  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
75  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
76  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
77  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
78  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
79
80  %a = load double, double addrspace(1)* %gep.0
81  %b = load double, double addrspace(1)* %gep.1
82  %c = load double, double addrspace(1)* %gep.2
83
  ; The product is the second fadd operand here.
84  %mul = fmul double %a, %b
85  %fma = fadd double %c, %mul
86  store double %fma, double addrspace(1)* %gep.out
87  ret void
88}
89
90; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
91; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
92; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
93; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
94; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
95; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
96; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads a, b, c from in[tid..tid+2] and stores (a * b) - c to out[tid].
97define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
98  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
99  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
100  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
101  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
102  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
103
104  %a = load double, double addrspace(1)* %gep.0
105  %b = load double, double addrspace(1)* %gep.1
106  %c = load double, double addrspace(1)* %gep.2
107
  ; Product minus c; the checks above expect the subtraction to become a
  ; negate modifier on the fma addend.
108  %mul = fmul double %a, %b
109  %fma = fsub double %mul, %c
110  store double %fma, double addrspace(1)* %gep.out
111  ret void
112}
113
114; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
115; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
116; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
117; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
118; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
119; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
120; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
121; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
122; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
123; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
124; SI: s_endpgm
; Kernel: loads a, b, c, d from in[tid..tid+3] and stores (a * b) - c to
; out[tid] and (a * b) - d to out[tid+1]. The product has two users.
125define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
126  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
127  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
128  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
129  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
130  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
131  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
132  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
133
134  %a = load double, double addrspace(1)* %gep.0
135  %b = load double, double addrspace(1)* %gep.1
136  %c = load double, double addrspace(1)* %gep.2
137  %d = load double, double addrspace(1)* %gep.3
138
  ; One fmul is the minuend of both fsubs.
139  %mul = fmul double %a, %b
140  %fma0 = fsub double %mul, %c
141  %fma1 = fsub double %mul, %d
142  store double %fma0, double addrspace(1)* %gep.out.0
143  store double %fma1, double addrspace(1)* %gep.out.1
144  ret void
145}
146
147; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
148; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
149; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
150; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
151; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
152; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
153; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads a, b, c from in[tid..tid+2] and stores c - (a * b) to out[tid].
154define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
155  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
156  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
157  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
158  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
159  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
160
161  %a = load double, double addrspace(1)* %gep.0
162  %b = load double, double addrspace(1)* %gep.1
163  %c = load double, double addrspace(1)* %gep.2
164
  ; The product is the subtrahend; the checks above expect the negate
  ; modifier on the first multiplicand instead.
165  %mul = fmul double %a, %b
166  %fma = fsub double %c, %mul
167  store double %fma, double addrspace(1)* %gep.out
168  ret void
169}
170
171; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
172; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
173; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
174; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
175; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
176; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
177; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
178; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
179; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
180; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
181; SI: s_endpgm
; Kernel: loads a, b, c, d from in[tid..tid+3] and stores c - (a * b) to
; out[tid] and d - (a * b) to out[tid+1]. The product has two users.
182define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
183  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
184  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
185  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
186  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
187  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
188  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
189  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
190
191  %a = load double, double addrspace(1)* %gep.0
192  %b = load double, double addrspace(1)* %gep.1
193  %c = load double, double addrspace(1)* %gep.2
194  %d = load double, double addrspace(1)* %gep.3
195
  ; One fmul is the subtrahend of both fsubs.
196  %mul = fmul double %a, %b
197  %fma0 = fsub double %c, %mul
198  %fma1 = fsub double %d, %mul
199  store double %fma0, double addrspace(1)* %gep.out.0
200  store double %fma1, double addrspace(1)* %gep.out.1
201  ret void
202}
203
204; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
205; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
206; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
207; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
208; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
209; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
210; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads a, b, c from in[tid..tid+2] and stores (-0.0 - (a * b)) - c
; to out[tid].
211define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
212  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
213  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
214  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
215  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
216  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
217
218  %a = load double, double addrspace(1)* %gep.0
219  %b = load double, double addrspace(1)* %gep.1
220  %c = load double, double addrspace(1)* %gep.2
221
222  %mul = fmul double %a, %b
  ; Subtracting from -0.0 is how this test spells the fneg named in the
  ; fold description above the checks.
223  %mul.neg = fsub double -0.0, %mul
224  %fma = fsub double %mul.neg, %c
225
226  store double %fma, double addrspace(1)* %gep.out
227  ret void
228}
229
230; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; Here the negated product has two users, so the fold is expected twice.
231; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
232; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
233; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
234; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; D has to be captured in this function: FileCheck variables are not reset at
; label boundaries by default, so an uncaptured use would silently reuse the
; register pair captured by an earlier test.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
235; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
236; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
237; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
238; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
239; SI: s_endpgm
; Kernel: loads a, b, c, d from in[tid..tid+3] and stores
; (-0.0 - (a * b)) - c to out[tid] and (-0.0 - (a * b)) - d to out[tid+1].
240define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
241  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
242  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
243  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
244  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
245  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
246  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
247  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
248
249  %a = load double, double addrspace(1)* %gep.0
250  %b = load double, double addrspace(1)* %gep.1
251  %c = load double, double addrspace(1)* %gep.2
252  %d = load double, double addrspace(1)* %gep.3
253
254  %mul = fmul double %a, %b
  ; The negation (written as -0.0 - mul) is shared by both subtractions.
255  %mul.neg = fsub double -0.0, %mul
256  %fma0 = fsub double %mul.neg, %c
257  %fma1 = fsub double %mul.neg, %d
258
259  store double %fma0, double addrspace(1)* %gep.out.0
260  store double %fma1, double addrspace(1)* %gep.out.1
261  ret void
262}
263
264; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; The un-negated product has a second user, which exercises
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) in the same function.
265; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
266; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
267; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
268; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; D has to be captured in this function: FileCheck variables are not reset at
; label boundaries by default, so an uncaptured use would silently reuse the
; register pair captured by an earlier test.
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
269; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
270; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
271; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
272; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
273; SI: s_endpgm
; Kernel: loads a, b, c, d from in[tid..tid+3] and stores
; (-0.0 - (a * b)) - c to out[tid] and (a * b) - d to out[tid+1].
274define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
275  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
276  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
277  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
278  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
279  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
280  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
281  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1
282
283  %a = load double, double addrspace(1)* %gep.0
284  %b = load double, double addrspace(1)* %gep.1
285  %c = load double, double addrspace(1)* %gep.2
286  %d = load double, double addrspace(1)* %gep.3
287
288  %mul = fmul double %a, %b
289  %mul.neg = fsub double -0.0, %mul
  ; First result uses the negated product, second uses the plain product.
290  %fma0 = fsub double %mul.neg, %c
291  %fma1 = fsub double %mul, %d
292
293  store double %fma0, double addrspace(1)* %gep.out.0
294  store double %fma1, double addrspace(1)* %gep.out.1
295  ret void
296}
297
298; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
299
300; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
301; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
302; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
303; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
304; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
305; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
306; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
307; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
308; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads x, y, z, u, v from in[tid..tid+4] and stores
; fma(x, y, u * v) - z to out[tid].
309define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
310  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
311  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
312  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
313  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
314  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
315  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
316  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
317
318  %x = load double, double addrspace(1)* %gep.0
319  %y = load double, double addrspace(1)* %gep.1
320  %z = load double, double addrspace(1)* %gep.2
321  %u = load double, double addrspace(1)* %gep.3
322  %v = load double, double addrspace(1)* %gep.4
323
  ; An explicit fma whose addend is a plain fmul, followed by an fsub; the
  ; checks above expect two chained v_fma_f64.
324  %tmp0 = fmul double %u, %v
325  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
326  %tmp2 = fsub double %tmp1, %z
327
328  store double %tmp2, double addrspace(1)* %gep.out
329  ret void
330}
331
332; fold (fsub x, (fma y, z, (fmul u, v)))
333;   -> (fma (fneg y), z, (fma (fneg u), v, x))
334
335; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
336; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
337; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
338; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
339; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
340; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
341; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
342; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
343; SI: buffer_store_dwordx2 [[RESULT]]
; Kernel: loads x, y, z, u, v from in[tid..tid+4] and stores
; x - fma(y, z, u * v) to out[tid].
344define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
345  %tid = tail call i32 @llvm.r600.read.tidig.x() #0
346  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
347  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
348  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
349  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
350  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
351  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid
352
353  %x = load double, double addrspace(1)* %gep.0
354  %y = load double, double addrspace(1)* %gep.1
355  %z = load double, double addrspace(1)* %gep.2
356  %u = load double, double addrspace(1)* %gep.3
357  %v = load double, double addrspace(1)* %gep.4
358
  ; The explicit fma (with an fmul addend) is the subtrahend here; the
  ; checks above expect two chained v_fma_f64 with negated multiplicands.
359  %tmp0 = fmul double %u, %v
360  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
361  %tmp2 = fsub double %x, %tmp1
362
363  store double %tmp2, double addrspace(1)* %gep.out
364  ret void
365}
366
; Group #0 is used on the intrinsic declarations and their call sites;
; group #1 is used on every test kernel definition.
367attributes #0 = { nounwind readnone }
368attributes #1 = { nounwind }
369