; Codegen checks for the llvm.fma.f16 and llvm.fma.v2f16 intrinsics on amdgcn.
; Each function is compiled twice: for tahiti (check prefixes GCN and SI) and
; for fiji (check prefixes GCN and VI).
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)

; Scalar half FMA with all three operands loaded from global memory.
; The tahiti checks expect every operand to be converted to f32, combined by
; v_fma_f32, and the result converted back to f16. The fiji checks expect a
; single v_fma_f16 on the loaded values. The v_fma_f32 operands reuse the
; registers captured by the preceding conversions.
; GCN-LABEL: {{^}}fma_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half FMA whose first operand is the constant 3.0.
; The constant is expected in a register via v_mov_b32: 0x40400000 (3.0 as
; f32) on tahiti, 0x4200 (3.0 as f16) on fiji. The fiji check has the constant
; as the second source of the FMA; the tahiti v_fma_f32 check mirrors that
; operand order using the registers captured above it.
; GCN-LABEL: {{^}}fma_f16_imm_a:
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]

; SI:  v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_a(
    half addrspace(1)* %r,
    half addrspace(1)* %b,
    half addrspace(1)* %c) {
  %b.val = load half, half addrspace(1)* %b
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half 3.0, half %b.val, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half FMA whose second operand is the constant 3.0.
; The constant is expected in a register via v_mov_b32: 0x40400000 (3.0 as
; f32) on tahiti, 0x4200 (3.0 as f16) on fiji. Both targets are checked with
; the operands in source order (a, constant, c); the tahiti v_fma_f32 check
; uses the registers captured above it.
; GCN-LABEL: {{^}}fma_f16_imm_b:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
; SI:  v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_b(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %c) {
  %a.val = load half, half addrspace(1)* %a
  %c.val = load half, half addrspace(1)* %c
  %r.val = call half @llvm.fma.f16(half %a.val, half 3.0, half %c.val)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; Scalar half FMA whose addend (third operand) is the constant 3.0.
; The constant is expected in a register via v_mov_b32: 0x40400000 (3.0 as
; f32) on tahiti, 0x4200 (3.0 as f16) on fiji. Both targets are checked with
; the operands in source order (a, b, constant); the tahiti v_fma_f32 check
; uses the registers captured above it.
; GCN-LABEL: {{^}}fma_f16_imm_c:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
; SI:  v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]], v[[C_F32]]
; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_c(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) {
  %a.val = load half, half addrspace(1)* %a
  %b.val = load half, half addrspace(1)* %b
  %r.val = call half @llvm.fma.f16(half %a.val, half %b.val, half 3.0)
  store half %r.val, half addrspace(1)* %r
  ret void
}

; <2 x half> FMA with all three operands loaded as one dword each.
; The checks expect the high half of every operand to be extracted with a
; 16-bit right shift and the two lanes to be computed separately: through
; f32 conversions and v_fma_f32 on tahiti, directly with v_fma_f16 on fiji.
; The lane results are then packed with a 16-bit left shift and v_or, and no
; instruction containing "and" may appear between those two.
; GCN-LABEL: {{^}}fma_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]

; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]


; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> FMA whose first operand is the constant vector <3.0, 3.0>.
; The checks expect a single v_mov_b32 of the constant (0x40400000 on tahiti,
; 0x4200 on fiji) that feeds the FMA of both lanes as the second source.
; The lane results are packed with a 16-bit left shift and v_or, and no
; instruction containing "and" may appear between those two.
; The check variable names follow the order in which the loads are matched,
; so they are not guaranteed to line up with the IR value names %b and %c.
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]


; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]


; SI:  v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %b,
    <2 x half> addrspace(1)* %c) {
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> <half 3.0, half 3.0>, <2 x half> %b.val, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> FMA whose second operand is the constant vector <3.0, 3.0>.
; The checks expect a single v_mov_b32 of the constant (0x40400000 on tahiti,
; 0x4200 on fiji) that feeds the FMA of both lanes as the second source.
; The two targets are checked with the loads in opposite orders, so the loads
; use target-specific prefixes. The lane results are packed with a 16-bit left
; shift and v_or, and no instruction containing "and" may appear between
; those two.
; GCN-LABEL: {{^}}fma_v2f16_imm_b:
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]

; SI:  v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}

; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]

; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]

; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %c) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %c.val = load <2 x half>, <2 x half> addrspace(1)* %c
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> <half 3.0, half 3.0>, <2 x half> %c.val)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}

; <2 x half> FMA whose addend (third operand) is the constant vector
; <3.0, 3.0>. The checks expect a single v_mov_b32 of the constant
; (0x40400000 on tahiti, 0x4200 on fiji) that feeds the FMA of both lanes as
; the third source. The packing sequence differs per target: the tahiti
; checks expect a 16-bit left shift of the high result followed by v_or,
; while the fiji checks expect v_or directly on the two v_fma_f16 results.
; In both cases no instruction containing "and" may precede the v_or.
; GCN-LABEL: {{^}}fma_v2f16_imm_c:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]

; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]

; SI:  v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}

; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]

; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]

; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]


; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(
    <2 x half> addrspace(1)* %r,
    <2 x half> addrspace(1)* %a,
    <2 x half> addrspace(1)* %b) {
  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
  %r.val = call <2 x half> @llvm.fma.v2f16(<2 x half> %a.val, <2 x half> %b.val, <2 x half> <half 3.0, half 3.0>)
  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
  ret void
}
