1; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
3
4; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
5; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
6
7; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
9; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
10; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
11
12declare i32 @llvm.amdgcn.workitem.id.x() #1
13declare half @llvm.fmuladd.f16(half, half, half) #1
14declare half @llvm.fabs.f16(half) #1
15
16; GCN-LABEL: {{^}}fmuladd_f16:
17; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
18
19; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
20
21; GFX10-FLUSH:  v_mul_f16_e32
22; GFX10-FLUSH:  v_add_f16_e32
23; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
24
; Basic llvm.fmuladd.f16: the CHECK lines above expect v_mac (VI flush),
; v_fma (VI denorm), v_fmac (GFX10 denorm), or separate mul+add (GFX10 flush).
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %r3 = tail call half @llvm.fmuladd.f16(half %r0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %out
  ret void
}
34
35; GCN-LABEL: {{^}}fmul_fadd_f16:
36; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
37
38; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
39
40; GFX10-FLUSH:  v_mul_f16_e32
41; GFX10-FLUSH:  v_add_f16_e32
42; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
43
; Separate fmul + fadd with no contract flag: fusing is only expected under
; -fp-contract=fast (the *-CONTRACT prefixes in the CHECK lines above).
define amdgpu_kernel void @fmul_fadd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
54
55; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
56; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
57
58; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
59
60; GFX10-FLUSH:  v_mul_f16_e32
61; GFX10-FLUSH:  v_add_f16_e32
62; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
63
; Same as fmul_fadd_f16 but the fadd carries the `contract` fast-math flag,
; so fusing is expected for all denorm run lines, not just -fp-contract=fast.
define amdgpu_kernel void @fmul_fadd_contract_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                         half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
  %r0 = load half, half addrspace(1)* %in1
  %r1 = load half, half addrspace(1)* %in2
  %r2 = load half, half addrspace(1)* %in3
  %mul = fmul half %r0, %r1
  %add = fadd contract half %mul, %r2
  store half %add, half addrspace(1)* %out
  ret void
}
74
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16:
76; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
77; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
78; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
79; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
80
81; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
82; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
83
84; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
85; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
86
87; VI-DENORM:    flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
88; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
89; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
90
; fmuladd with inline-immediate 2.0 as the first multiplicand; volatile loads
; keep %r1 and %r2 as two distinct loads from consecutive elements.
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
104
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16:
106; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
107; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
108; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
109; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
110
111; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
112; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
113
114; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
115; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
116
117; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
118; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
119; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
120
; Commuted form of fmuladd_2.0_a_b_f16: immediate 2.0 is the second
; multiplicand; the CHECK lines above expect the same instruction selection.
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half %r1, half 2.0, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
134
135; GCN-LABEL: {{^}}fadd_a_a_b_f16:
136; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
137; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
138; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
139; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
140
141; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
142; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
143
144; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
145; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
146
147; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
148
149; GFX10-FLUSH:           v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
150; GFX10-FLUSH:           v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
151; GFX10-FLUSH:           global_store_short v{{[0-9]+}}, [[RESULT]]
152; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
153; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
154
; (a + a) + b: under contraction a+a can be treated as 2.0*a and folded into
; a mac/fma with the immediate; strict denorm runs keep the two adds.
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %add.0, %r1
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
171
172; GCN-LABEL: {{^}}fadd_b_a_a_f16:
173; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
174; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
175; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
176; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
177
178; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
179; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
180
181; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
182; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]],  [[R2]], [[TMP]]
183
184; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
185
186; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
187; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
188; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
189; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
190; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
191
; b + (a + a): commuted variant of fadd_a_a_b_f16 with the doubled value as
; the second addend; expected selection mirrors that test.
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                            half addrspace(1)* %in1,
                            half addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r0 = load volatile half, half addrspace(1)* %gep.0
  %r1 = load volatile half, half addrspace(1)* %gep.1

  %add.0 = fadd half %r0, %r0
  %add.1 = fadd half %r1, %add.0
  store half %add.1, half addrspace(1)* %gep.out
  ret void
}
208
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16:
210; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
211; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
212; VI-FLUSH:     v_mac_f16_e32 [[R2]], -2.0, [[R1]]
213; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
214; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
215; VI-FLUSH:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
216; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
217; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
218; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
219; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
220; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd with inline-immediate -2.0; the GFX10 flush path is expected to
; lower this as add (x+x) followed by a subtract (see CHECK lines above).
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
234
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16:
236; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
237; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
238; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
239; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
240
241; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
242; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
243
244; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
245; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
246; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
247
248; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
249; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd(-2.0, -a, b): the two negations cancel, so the CHECK lines above
; expect the positive-2.0 selection (mac/fma/fmac with 2.0).
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half -2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
265
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16:
267; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
268; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
269; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
270; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
271
272; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
273; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
274
275; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
276; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
277; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
278
279; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
280; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]]
; fmuladd(2.0, -a, b): the fneg folds into the immediate, so the CHECK lines
; above expect selection with -2.0 (mac/fma/fmac) or add+sub on GFX10 flush.
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r1.fneg = fneg half %r1

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1.fneg, half %r2)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
296
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16:
298; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
299; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
300; VI-FLUSH:   v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
301; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
302; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
303; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
304; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
305; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; fmuladd(2.0, a, -b): negated addend; expected to select mad/fma with a
; negated source modifier on the addend (see CHECK lines above).
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %r2.fneg = fneg half %r2

  %r3 = tail call half @llvm.fmuladd.f16(half 2.0, half %r1, half %r2.fneg)
  store half %r3, half addrspace(1)* %gep.out
  ret void
}
321
322; GCN-LABEL: {{^}}mad_sub_f16:
323; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
324; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
325; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
326
327; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
328
329; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
330
331; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
332; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
333
334; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
335
336; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
337; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
338; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*b - c: contract runs fold this into mad/fma with a negated addend;
; strict denorm runs keep the separate mul and sub (see CHECK lines above).
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
356
357; GCN-LABEL: {{^}}mad_sub_inv_f16:
358; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
359; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
360; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
361; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
362
363; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
364; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
365
366; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
367; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
368
369; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
370
371; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
372; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
373; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
374; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
375; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
; c - a*b: inverted subtraction; contract runs fold to mad/fma with a negated
; first multiplicand, strict runs keep mul then reversed sub.
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %mul = fmul half %a, %b
  %sub = fsub half %c, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
393
394; GCN-LABEL: {{^}}mad_sub_fabs_f16:
395; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
396; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
397; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
398; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
399
400; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
401
402; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
403; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
404
405; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
406
407; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
408; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
409; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*b - |c|: fabs on the subtrahend; contract runs expect mad/fma with a
; -| | source modifier on the addend (see CHECK lines above).
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %mul, %c.abs
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
428
429; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
430; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
431; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
432; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
433
434; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
435
436; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
437
438; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
439; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
440
441; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
442
443; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
444; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
445; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; |c| - a*b: inverted variant of mad_sub_fabs_f16; contract runs expect
; mad/fma with -a and |c| source modifiers (see CHECK lines above).
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %c.abs = call half @llvm.fabs.f16(half %c) #0
  %mul = fmul half %a, %b
  %sub = fsub half %c.abs, %mul
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
464
465; GCN-LABEL: {{^}}neg_neg_mad_f16:
466; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
467; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
468; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
469
470; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
471; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
472
473; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
474; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
475
476; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
477; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
478; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
479
480; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
481; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
482; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
483; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
484; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
; (-a)*(-b) + c: the two fnegs cancel, so the CHECK lines above expect the
; plain mac/fma/fmac of a*b+c with no source modifiers.
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %nega = fneg half %a
  %negb = fneg half %b
  %mul = fmul half %nega, %negb
  %sub = fadd half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
504
505; GCN-LABEL: {{^}}mad_fabs_sub_f16:
506; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
507; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
508; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
509
510; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
511
512; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
513
514; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
515; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
516
517; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
518
519; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
520; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
521; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; a*|b| - c: fabs on a multiplicand; contract runs expect mad/fma with the
; | | source modifier on the second multiplicand (see CHECK lines above).
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %tid.ext = sext i32 %tid to i64
  %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
  %add1 = add i64 %tid.ext, 1
  %gep1 = getelementptr half, half addrspace(1)* %ptr, i64 %add1
  %add2 = add i64 %tid.ext, 2
  %gep2 = getelementptr half, half addrspace(1)* %ptr, i64 %add2
  %outgep = getelementptr half, half addrspace(1)* %out, i64 %tid.ext
  %a = load volatile half, half addrspace(1)* %gep0, align 2
  %b = load volatile half, half addrspace(1)* %gep1, align 2
  %c = load volatile half, half addrspace(1)* %gep2, align 2
  %b.abs = call half @llvm.fabs.f16(half %b) #0
  %mul = fmul half %a, %b.abs
  %sub = fsub half %mul, %c
  store half %sub, half addrspace(1)* %outgep, align 2
  ret void
}
540
541; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
542; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
543; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
544; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
545; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
546
547; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
548; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
549
550; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
551; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
552
553; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
554
555; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
556; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
557; GFX10-FLUSH:  global_store_short v{{[0-9]+}}, [[RESULT]]
558; GFX10-DENORM-STRICT:   global_store_short v{{[0-9]+}}, [[RESULT]]
559; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]]
; c - (a + a): contract runs fold the doubled value into mac/fma with -2.0;
; strict runs keep add then sub (see CHECK lines above).
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %r2, %add

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
575
576; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
577; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
578; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
579
580; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
581
582; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
583
584; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
585; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
586
587; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
588
589; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
590; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
591; GFX10:       global_store_short v{{[0-9]+}}, [[RESULT]]
; (a + a) - c: contract runs fold to mad/fma of a*2.0 with a negated addend;
; strict runs keep add then sub (see CHECK lines above).
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr half, half addrspace(1)* %out, i32 %tid

  %r1 = load volatile half, half addrspace(1)* %gep.0
  %r2 = load volatile half, half addrspace(1)* %gep.1

  %add = fadd half %r1, %r1
  %r3 = fsub half %add, %r2

  store half %r3, half addrspace(1)* %gep.out
  ret void
}
607
608attributes #0 = { nounwind }
609attributes #1 = { nounwind readnone }
610