1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
5
; Intrinsics exercised by this file, in scalar half and <2 x half> form.
declare half @llvm.fmuladd.f16(half, half, half)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>)
8
9; GCN-LABEL: {{^}}fmuladd_f16
10; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
11; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
12; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
13; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
14; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
15; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
16; SI:  v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
17; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
18; SI:  buffer_store_short v[[R_F16]]
19
20; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
21; VI-FLUSH: buffer_store_short v[[C_F16]]
22
23; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
24; VI-DENORM: buffer_store_short [[RESULT]]
25
26; GCN: s_endpgm
; Loads one half from each of %a, %b and %c (in that order), passes them as
; the first, second and third operands of llvm.fmuladd.f16, and stores the
; result through %r.
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %r,
                                       half addrspace(1)* %a,
                                       half addrspace(1)* %b,
                                       half addrspace(1)* %c) {
  %factor0 = load half, half addrspace(1)* %a
  %factor1 = load half, half addrspace(1)* %b
  %addend = load half, half addrspace(1)* %c
  %fused = call half @llvm.fmuladd.f16(half %factor0, half %factor1, half %addend)
  store half %fused, half addrspace(1)* %r
  ret void
}
39
40; GCN-LABEL: {{^}}fmuladd_f16_imm_a
41; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
42; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
43; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
44; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
45; SI:  v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]]
46; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
47; SI:  buffer_store_short v[[R_F16]]
48
49; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
50; VI-FLUSH: buffer_store_short v[[C_F16]]
51
52; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
53; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
54; VI-DENORM: buffer_store_short [[RESULT]]
55
56; GCN: s_endpgm
; The first fmuladd operand is the constant 3.0; the second and third operands
; are volatile loads from %b and %c. The result is stored through %r.
define amdgpu_kernel void @fmuladd_f16_imm_a(half addrspace(1)* %r,
                                             half addrspace(1)* %b,
                                             half addrspace(1)* %c) {
  %factor = load volatile half, half addrspace(1)* %b
  %addend = load volatile half, half addrspace(1)* %c
  %fused = call half @llvm.fmuladd.f16(half 3.0, half %factor, half %addend)
  store half %fused, half addrspace(1)* %r
  ret void
}
67
; GCN-LABEL: {{^}}fmuladd_f16_imm_b
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
; SI:  buffer_store_short v[[R_F16]]

; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
; VI-FLUSH: buffer_store_short v[[C_F16]]

; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]


; GCN: s_endpgm

; The second fmuladd operand is the constant 3.0; the first and third operands
; are volatile loads from %a and %c. The result is stored through %r.
; Note for maintainers - a check prefix is only honoured by FileCheck when it
; is immediately followed by a colon, so keep the colon on every check above.
define amdgpu_kernel void @fmuladd_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load volatile half, half addrspace(1)* %a
  %c.val = load volatile half, half addrspace(1)* %c
  %r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}
96
97; GCN-LABEL: {{^}}fmuladd_v2f16
98; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
99; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
100; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
101
102; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
103; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
104; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]
105
106; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
107; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
108; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
109
110
111; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
112; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
113; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
114; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
115
116; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
117; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
118
119
120; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
121; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
122; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
123; SI-DAG:  v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
124; SI-DAG:  v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
125; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
126; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
127; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
128; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
129
130
131; VI-FLUSH:     v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
132; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
133; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
134; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
135; VI-FLUSH-NOT: v_and_b32
136; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
137
138; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
139; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
140; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
141; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
142; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
143; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
144; VI-DENORM-NOT: v_and_b32
145; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
146
147; GCN: buffer_store_dword v[[R_V2_F16]]
; Two-element vector form. Loads a <2 x half> from each of %a, %b and %c (in
; that order), passes them as the first, second and third operands of
; llvm.fmuladd.v2f16, and stores the vector result through %r.
define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %r,
                                         <2 x half> addrspace(1)* %a,
                                         <2 x half> addrspace(1)* %b,
                                         <2 x half> addrspace(1)* %c) {
  %factor0 = load <2 x half>, <2 x half> addrspace(1)* %a
  %factor1 = load <2 x half>, <2 x half> addrspace(1)* %b
  %addend = load <2 x half>, <2 x half> addrspace(1)* %c
  %fused = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %factor0, <2 x half> %factor1, <2 x half> %addend)
  store <2 x half> %fused, <2 x half> addrspace(1)* %r
  ret void
}
160