; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s

; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s

; Intrinsics referenced by the kernels in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
declare half @llvm.fabs.f16(half) #1

; llvm.fmuladd.f16 on three halves loaded from %in1, %in2 and %in3.
; The runs with f16 denormals disabled expect v_mac_f16; the runs with
; f16 denormals enabled expect v_fma_f16.
; GCN-LABEL: {{^}}fmuladd_f16:
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}

; fmuladd(2.0, a, b) with the constant as the first multiplicand.
; a and b are volatile loads from consecutive elements of %out indexed by
; the workitem id; %in is unused.
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(a, 2.0, b) with the constant as the second multiplicand.
; The expected instructions are the same as for the constant-first form.
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) + b written as two separate fadds, with no fmuladd intrinsic.
; With f16 denormals disabled a single v_mac_f16 by 2.0 is expected.
; With f16 denormals enabled, the -fp-contract=fast run expects v_fma_f16
; and the -fp-contract=on run expects the two adds to stay separate.
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; b + (a + a): same as the a_a_b variant but with the operands of the
; outer fadd swapped. The expected strict-mode add has its sources in the
; swapped order as well.
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, a, b): the negative constant is expected to appear
; directly as the -2.0 operand.
; NOTE(review): the denormal-mode check below rebinds R2 on the same line
; that also uses it, so it presumably only requires the destination and
; the addend to be the same register rather than the second loaded value.
; Confirm this is intended.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(-2.0, -a, b): the negation of a (fsub from -0.0) and the
; negative constant are expected to cancel, leaving a positive 2.0 operand
; and no source modifier on a.
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, -a, b): the negation of a (fsub from -0.0) is expected to
; be folded into the constant, giving a -2.0 operand.
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fsub half -0.000000e+00, %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; fmuladd(2.0, a, -b): the addend is negated (fsub from -0.0). The runs
; with f16 denormals disabled expect v_mad_f16 with a negated third source
; instead of v_mac_f16; the denormal runs expect v_fma_f16 with the same
; negated source.
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fsub half -0.000000e+00, %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; a * b - c written as separate fmul and fsub on three consecutive
; elements of %ptr. A fused form with a negated third source is expected
; except in the denormal run with -fp-contract=on, which expects separate
; multiply and subtract.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep, while the workitem-id call carries
; #0. The two may have been swapped; confirm before changing.
; GCN-LABEL: {{^}}mad_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - a * b: the subtraction operands are reversed relative to
; mad_sub_f16, so the fused forms are expected to negate the first
; multiplicand instead of the addend.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep; confirm that is intended.
; GCN-LABEL: {{^}}mad_sub_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; a * b - |c|: the fabs on the subtrahend is expected to fold into the
; instruction as an absolute-value source modifier, combined with the
; negation in the fused forms.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep; confirm that is intended.
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; |c| - a * b: reversed subtraction with fabs on the minuend. The fused
; forms are expected to negate the first multiplicand and apply the
; absolute-value modifier to the addend.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep; confirm that is intended.
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; (-a) * (-b) + c: both multiplicands are negated (fsub from -0.0). The
; expected instructions carry no negation modifiers, so the two negations
; are expected to cancel.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep; confirm that is intended.
; GCN-LABEL: {{^}}neg_neg_mad_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fsub half -0.000000e+00, %a
  %negb = fsub half -0.000000e+00, %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; a * |b| - c: fabs on the second multiplicand. The absolute-value
; modifier is expected on that source, and the strict-mode multiply is
; expected in its e64 encoding.
; NOTE(review): this kernel carries attribute group #1 (nounwind readnone)
; although it stores through %outgep; confirm that is intended.
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]

; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}

; c - (a + a): the doubled value is subtracted from the second load. The
; fused forms are expected to use a -2.0 constant; the denormal run with
; -fp-contract=on expects a separate add and subtract.
; This kernel has no attribute group, unlike most others in the file.
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]

; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; (a + a) - c: the second load is subtracted from the doubled value. The
; fused forms are expected to use a 2.0 constant with a negated addend;
; the denormal run with -fp-contract=on expects a separate add and
; subtract.
; This kernel has no attribute group, unlike most others in the file.
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],

; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]

; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]

; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}

; Attribute groups referenced by the declarations, kernels and call sites
; in this file.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }