; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX900
; RUN: llc -march=amdgcn -mcpu=gfx906 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-UNSAFE
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-CONTRACT
; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s  -check-prefixes=GCN,GFX906-DENORM-CONTRACT
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)

; Negative test: fdot2 must not be generated when the vector elements of the
; dot-product expression are not converted from f16 to f32. Both products and
; both additions below are performed directly on half values.
; GCN-LABEL: {{^}}dotproduct_f16:
; GFX900: v_fma_legacy_f16
; GFX900: v_fma_legacy_f16

; GFX906: v_mul_f16_e32
; GFX906: v_mul_f16_e32

; GFX906-UNSAFE:  v_fma_legacy_f16

; GFX906-CONTRACT: v_mac_f16_e32
; GFX906-DENORM-CONTRACT: v_fma_legacy_f16
define amdgpu_kernel void @dotproduct_f16(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          half addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Element 0 of each input vector.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0

  ; Element 1 of each input vector.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1

  ; No fpext anywhere: the whole expression stays in half precision.
  %mul2 = fmul half %src1.el2, %src2.el2
  %mul1 = fmul half %src1.el1, %src2.el1
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load half, half addrspace(1)* %dst, align 2
  %acc1 = fadd half %mul2, %acc
  %acc2 = fadd half %mul1, %acc1
  store half %acc2, half addrspace(1)* %dst, align 2
  ret void
}


; Positive test: fdot2 should only be generated when the vector elements of the
; dot product are converted from f16 to f32 and the vectors are of type
; <2 x half>. The gfx906 unsafe-math and fp-contract runs expect v_dot2_f32_f16;
; the gfx900 run and the default gfx906 run expect mad-based sequences.
; GCN-LABEL: {{^}}dotproduct_f16_f32
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_dot2_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16

; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_f16_f32(<2 x half> addrspace(1)* %src1,
                                              <2 x half> addrspace(1)* %src2,
                                              float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Element 0 of each vector, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Element 1 of each vector, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; src1[0]*src2[0] + (src1[1]*src2[1] + acc), all in float.
  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Positive test: same expression as @dotproduct_f16_f32, except that the
; operands of the second product are swapped (src2[1] * src1[1] instead of
; src1[1] * src2[1]). The elements are still converted from f16 to f32 and the
; vectors are still <2 x half>, so the same instructions are expected.
; GCN-LABEL: {{^}}dotproduct_diffvecorder
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_dot2_f32_f16

; GFX906-CONTRACT: v_dot2_f32_f16
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
define amdgpu_kernel void @dotproduct_diffvecorder(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Element 0 of each vector, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Element 1 of each vector, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Note the operand order: the src2 element comes first in %mul2.
  %mul2 = fmul float %csrc2.el2, %csrc1.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Negative test: the dot product instruction must not be generated when the
; vectors are not of type <2 x half>. The inputs here are <4 x half>; only
; elements 0 and 1 take part in the expression.
; GCN-LABEL: {{^}}dotproduct_v4f16
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @dotproduct_v4f16(<4 x half> addrspace(1)* %src1,
                                            <4 x half> addrspace(1)* %src2,
                                            float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <4 x half>, <4 x half> addrspace(1)* %src1
  %src2.vec = load <4 x half>, <4 x half> addrspace(1)* %src2

  ; Element 0 of each vector, extended to float.
  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Element 1 of each vector, extended to float.
  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; src1[0]*src2[0] + (src1[1]*src2[1] + acc), all in float.
  %mul2 = fmul float %csrc1.el2, %csrc2.el2
  %mul1 = fmul float %csrc1.el1, %csrc2.el1
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Negative test: the expression has the shape of a dot product, but each
; product multiplies the two elements of the same vector
; (src1[1]*src1[0] and src2[0]*src2[1]), so the checks expect fma/mad mix
; instructions rather than v_dot2_f32_f16.
; GCN-LABEL: {{^}}NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                          <2 x half> addrspace(1)* %src2,
                                          float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Element 0 of each vector, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Element 1 of each vector, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Both operands of each product come from the same source vector.
  %mul2 = fmul float %csrc1.el2, %csrc1.el1
  %mul1 = fmul float %csrc2.el1, %csrc2.el2
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}

; Negative test: each product takes one element from each vector, but the
; element indices do not match (src1[1]*src2[0] and src1[0]*src2[1]), so the
; checks expect fma/mad mix instructions rather than v_dot2_f32_f16.
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
; GFX900: v_mad_mix_f32
; GFX900: v_mad_mix_f32

; GFX906: v_mad_f32
; GFX906: v_mac_f32_e32

; GFX906-UNSAFE: v_fma_mix_f32

; GFX906-CONTRACT: v_fma_mix_f32
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(<2 x half> addrspace(1)* %src1,
                                                   <2 x half> addrspace(1)* %src2,
                                                   float addrspace(1)* nocapture %dst) {
entry:
  %src1.vec = load <2 x half>, <2 x half> addrspace(1)* %src1
  %src2.vec = load <2 x half>, <2 x half> addrspace(1)* %src2

  ; Element 0 of each vector, extended to float.
  %src1.el1 = extractelement <2 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <2 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Element 1 of each vector, extended to float.
  %src1.el2 = extractelement <2 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <2 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; Cross-index products: element 1 with element 0, and element 0 with element 1.
  %mul2 = fmul float %csrc1.el2, %csrc2.el1
  %mul1 = fmul float %csrc1.el1, %csrc2.el2
  ; The accumulator is read from %dst and the result is stored back to %dst.
  %acc = load float, float addrspace(1)* %dst, align 4
  %acc1 = fadd float %mul2, %acc
  %acc2 = fadd float %mul1, %acc1
  store float %acc2, float addrspace(1)* %dst, align 4
  ret void
}