; Make sure we still form mad instead of fma even when unsafe math or fp-contract is allowed.

; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s

; FIXME: Remove enable-unsafe-fp-math from the RUN line and add fast-math flags to the IR instructions instead.

; Make sure we don't form mad when f32 denormals are enabled.
; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
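
; The SI-STD prefixes cover the preserve-sign (flush) f32 denormal runs and the
; SI-DENORM prefixes the ieee f32 denormal runs. SI-DENORM-FASTFMAF (tahiti) vs.
; SI-DENORM-SLOWFMAF (verde) reflects whether f32 fma is considered at least as
; fast as a separate mul + add on the target.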

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
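
; On SI, v_mad_f32 does not handle f32 denormals, so the ieee-denormal runs are
; expected to select v_fma_f32 (when fma is fast) or keep a separate mul + add
; instead of forming mad.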

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF-NOT: v_fma
; SI-DENORM-SLOWFMAF-NOT: v_mad

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_mad_f32_0_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fadd float %mul, %c
  %fma1 = fadd float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_mad_f32_1:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI-DENORM: buffer_store_dword [[RESULT]]
; SI-STD: buffer_store_dword [[C]]
define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fadd float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %mul, %c
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_0_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %mul, %c
  %fma1 = fsub float %mul, %d
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %fma = fsub float %c, %mul
  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_mad_fsub_1_f32_2use:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %fma0 = fsub float %c, %mul
  %fma1 = fsub float %d, %mul
  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}

; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]

; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]

; SI: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma = fsub float %mul.neg, %c

  store float %fma, float addrspace(1)* %gep.out
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
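; Both results use the negated mul, so the fneg is folded into both mad/fma operands.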
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_neg:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul.neg, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
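; Only the first result uses the negated mul; the second uses %mul directly, so
; only RESULT0 gets the negated operands.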
; FUNC-LABEL: {{^}}combine_to_mad_fsub_2_f32_2uses_mul:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}

; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]

; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]

; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI: s_endpgm
define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr float, float addrspace(1)* %gep.out.0, i32 1

  %a = load volatile float, float addrspace(1)* %gep.0
  %b = load volatile float, float addrspace(1)* %gep.1
  %c = load volatile float, float addrspace(1)* %gep.2
  %d = load volatile float, float addrspace(1)* %gep.3

  %mul = fmul float %a, %b
  %mul.neg = fsub float -0.0, %mul
  %fma0 = fsub float %mul.neg, %c
  %fma1 = fsub float %mul, %d

  store volatile float %fma0, float addrspace(1)* %gep.out.0
  store volatile float %fma1, float addrspace(1)* %gep.out.1
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_0_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_1_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-STD: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fma.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, (fneg z)))
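; Note: only the unsafe-math run folds the trailing fsub into the mad chain
; (SI-STD-UNSAFE below); the other runs keep it as a separate subtract.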

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_2_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[A]], [[B]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[C]]

; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[A]], [[B]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[A]], [[B]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP2]], [[C]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  %tmp0 = fmul float %u, %v
  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
  %tmp2 = fsub float %tmp1, %z

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

; fold (fsub x, (fmuladd y, z, (fmul u, v)))
;   -> (fmuladd (fneg y), z, (fmuladd (fneg u), v, x))
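; Note: only the unsafe-math run performs this fold, producing two v_mad_f32
; with negated multiplicands; the other runs keep the final subtract.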

; FUNC-LABEL: {{^}}aggressive_combine_to_mad_fsub_3_f32:
; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}

; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[C]]
; SI-STD-SAFE: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP0]]

; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]

; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
; SI-DENORM-FASTFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP1]]

; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[D]], [[E]]
; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[C]]
; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP1]], [[TMP0]]
; SI-DENORM-SLOWFMAF: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[A]], [[TMP2]]

; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr float, float addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr float, float addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr float, float addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr float, float addrspace(1)* %out, i32 %tid

  %x = load volatile float, float addrspace(1)* %gep.0
  %y = load volatile float, float addrspace(1)* %gep.1
  %z = load volatile float, float addrspace(1)* %gep.2
  %u = load volatile float, float addrspace(1)* %gep.3
  %v = load volatile float, float addrspace(1)* %gep.4

  ; The nsz flag is needed since this combine may change the sign of zero.
  %tmp0 = fmul nsz float %u, %v
  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
  %tmp2 = fsub nsz float %x, %tmp1

  store float %tmp2, float addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }