; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s

; Intrinsics referenced by the kernels in this test. All are declared with
; attribute group #1.
;   @llvm.amdgcn.workitem.id.x - per-workitem index used to address %out.
;   @llvm.fmuladd.v2f16        - the packed-half multiply-add under test.
;   @llvm.fabs.v2f16           - declared here; no use is visible in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1

; fmuladd_v2f16
; Loads one <2 x half> from each of %in1, %in2 and %in3, combines them with
; @llvm.fmuladd.v2f16 and stores the result to %out.
;
; Expected selection, by check prefix:
;   GFX9-FLUSH  (runs with -denormal-fp-math=preserve-sign)
;       separate v_pk_mul_f16 followed by v_pk_add_f16
;   GFX9-DENORM (runs with -denormal-fp-math=ieee)
;       a single v_pk_fma_f16

; GCN-LABEL: {{^}}fmuladd_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  ; %r3 = fmuladd(%r0, %r1, %r2)
  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %out
  ret void
}

; fmul_fadd_v2f16
; Same data flow as a multiply-add, but written as a separate fmul and fadd
; with no fast-math flags on either instruction. Only the ieee-denormal runs
; carry instruction checks for this kernel:
;   GFX9-DENORM-STRICT   (-fp-contract=on)   v_pk_mul_f16 then v_pk_add_f16
;   GFX9-DENORM-CONTRACT (-fp-contract=fast) a single v_pk_fma_f16

; GCN-LABEL: {{^}}fmul_fadd_v2f16:
; GFX9-DENORM-STRICT: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-DENORM-STRICT: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  ; %r4 = (%r0 * %r1) + %r2, with no per-instruction contraction flag.
  %r3 = fmul <2 x half> %r0, %r1
  %r4 = fadd <2 x half> %r3, %r2
  store <2 x half> %r4, <2 x half> addrspace(1)* %out
  ret void
}

; fmul_fadd_contract_v2f16
; Separate fmul and fadd where only the fadd carries the `contract` flag.
; The checks expect the same split as for the fmuladd intrinsic:
;   GFX9-FLUSH  (preserve-sign denormal runs) v_pk_mul_f16 then v_pk_add_f16
;   GFX9-DENORM (ieee denormal runs)          a single v_pk_fma_f16

; GCN-LABEL: {{^}}fmul_fadd_contract_v2f16:
; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}

; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
define amdgpu_kernel void @fmul_fadd_contract_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 {
  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1
  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2
  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3
  ; The multiply has no flags; the add is marked `contract`.
  %r3 = fmul <2 x half> %r0, %r1
  %r4 = fadd contract <2 x half> %r3, %r2
  store <2 x half> %r4, <2 x half> addrspace(1)* %out
  ret void
}


; fmuladd_2.0_a_b_v2f16
; fmuladd with a splat 2.0 constant as the FIRST multiplicand:
;   result = fmuladd(<2.0, 2.0>, a, b)
; a is read from %out[tid] and b from the element after it; both loads are
; volatile. The result is stored back to %out[tid]. The %in argument is not
; referenced by the body.
;
; Expected selection, by check prefix:
;   GFX9-FLUSH   a + a via v_pk_add_f16, then a second v_pk_add_f16 with b
;   GFX9-DENORM  v_pk_fma_f16 with the 2.0 inline constant as second operand

; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 = &%out[tid], %gep.1 = &%out[tid + 1]; %gep.out aliases %gep.0.
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
  ret void
}

; fmuladd_a_2.0_b_v2f16
; fmuladd with a splat 2.0 constant as the SECOND multiplicand:
;   result = fmuladd(a, <2.0, 2.0>, b)
; a is read from %out[tid] and b from the element after it; both loads are
; volatile. The result is stored back to %out[tid]. The %in argument is not
; referenced by the body.
;
; The checks are the same as for the constant-first variant:
;   GFX9-FLUSH   a + a via v_pk_add_f16, then a second v_pk_add_f16 with b
;   GFX9-DENORM  v_pk_fma_f16 with the 2.0 inline constant as second operand

; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}

; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 = &%out[tid], %gep.1 = &%out[tid + 1]; %gep.out aliases %gep.0.
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2)
  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
  ret void
}

; fadd_a_a_b_v2f16
; Computes (a + a) + b with two plain fadd instructions (no fast-math flags).
; a is read from %out[tid] and b from the element after it; both loads are
; volatile. The result is stored back to %out[tid]. The %in1 and %in2
; arguments are not referenced by the body.
;
; Expected selection, by check prefix:
;   GFX9-FLUSH and GFX9-DENORM-STRICT  two v_pk_add_f16 instructions
;   GFX9-DENORM-CONTRACT               a + a folded into v_pk_fma_f16 a, 2.0, b

; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]],
; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]],
; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]

; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]

; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
                            <2 x half> addrspace(1)* %in1,
                            <2 x half> addrspace(1)* %in2) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %gep.0 = &%out[tid], %gep.1 = &%out[tid + 1]; %gep.out aliases %gep.0.
  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1
  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid

  %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0
  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1

  %add.0 = fadd <2 x half> %r0, %r0
  %add.1 = fadd <2 x half> %add.0, %r1
  store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
  ret void
}

; Attribute groups: #0 is applied to the kernel definitions, #1 to the
; intrinsic declarations.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
