; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s

; Code generation tests for half-precision fdiv on AMDGPU.
; The tahiti invocation is verified with the GCN and SI check prefixes; the
; fiji, gfx900 and gfx1010 invocations are verified with the GCN and
; GFX8_9_10 check prefixes. The fiji invocation used to be listed twice with
; identical arguments; one copy is enough.

; Make sure fdiv is promoted to f32.
;
; Plain `fdiv half %a, %b` with no fast-math flags and no !fpmath metadata.
; The SI checks expect both operands converted to f32, the div_scale / rcp /
; fma / div_fmas / div_fixup sequence in f32, and a conversion back to f16.
; The GFX8_9_10 checks expect both operands converted to f32, an f32
; reciprocal of the divisor multiplied by the dividend, a conversion back to
; f16, and a final v_div_fixup_f16 that takes the original f16 operands.

; GCN-LABEL: {{^}}v_fdiv_f16:
; SI:     v_cvt_f32_f16
; SI:     v_cvt_f32_f16
; SI:     v_div_scale_f32
; SI-DAG: v_div_scale_f32
; SI-DAG: v_rcp_f32
; SI:     v_fma_f32
; SI:     v_fma_f32
; SI:     v_mul_f32
; SI:     v_fma_f32
; SI:     v_fma_f32
; SI:     v_fma_f32
; SI:     v_div_fmas_f32
; SI:     v_div_fixup_f32
; SI:     v_cvt_f16_f32

; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]

; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16(
    half addrspace(1)* %r,
    half addrspace(1)* %a,
    half addrspace(1)* %b) #0 {
entry:
  ; Index all three buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  ; Volatile loads; the checks above capture the %a load first (LHS) and the
  ; %b load second (RHS).
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; 1.0 / %b with !fpmath !0 attached to the fdiv.
; The GFX8_9_10 checks expect the loaded value to feed a single
; v_rcp_f16_e32 whose result is stored directly. The -NOT lines reject any
; other mention of the loaded register before the rcp, and of the result
; register before the store.

; GCN-LABEL: {{^}}v_rcp_f16:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; 1.0 / fabs(%b) with !fpmath !0 attached to the fdiv.
; The GFX8_9_10 checks expect the fabs to be folded into the rcp as a source
; modifier (v_rcp_f16_e64 with |VAL|), with no other use of the loaded
; register before the rcp and no use of the result register before the store.

; GCN-LABEL: {{^}}v_rcp_f16_abs:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.abs = call half @llvm.fabs.f16(half %b.val)
  %r.val = fdiv half 1.0, %b.abs, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; 1/b cannot be turned into a bare rcp_f16(b) when the required accuracy is
; tighter than 1 ulp. The fdiv below carries neither fast-math flags nor
; !fpmath, so the GFX8_9_10 checks expect the f32 expansion instead: convert
; to f32, v_rcp_f32, convert back to f16, then v_div_fixup_f16 with the
; original divisor and the constant 1.0.

; GCN-LABEL: {{^}}reciprocal_f16_rounded:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{.+}}
; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half 1.0, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; 1.0 / %b where the fdiv carries both the afn fast-math flag and !fpmath !0.
; The GFX8_9_10 checks expect a single v_rcp_f16_e32 on the loaded value,
; with no other use of the loaded register before the rcp and no use of the
; result register before the store.

; GCN-LABEL: {{^}}v_rcp_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; -1.0 / %b with !fpmath !0 attached to the fdiv.
; The GFX8_9_10 checks expect the negation to be folded into the rcp as a
; source modifier (v_rcp_f16_e64 with -VAL), with no other use of the loaded
; register before the rcp and no use of the result register before the store.

; GCN-LABEL: {{^}}v_rcp_f16_neg:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half -1.0, %b.val, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; 1.0 / sqrt(%b) with !fpmath !0 attached to the fdiv.
; The GFX8_9_10 checks expect the sqrt and the reciprocal to be combined into
; a single v_rsq_f16_e32, with no other use of the loaded register before it
; and no use of the result register before the store.

; GCN-LABEL: {{^}}v_rsq_f16:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
  %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; -1.0 / sqrt(%b) with !fpmath !0 attached to the fdiv.
; Unlike the positive case, the GFX8_9_10 checks expect no rsq here: a
; v_sqrt_f16_e32 must be followed on the very next line by a v_rcp_f16_e64
; that negates the sqrt result via a source modifier.

; GCN-LABEL: {{^}}v_rsq_f16_neg:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
  ; Index the input and output buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
  %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; %a / %b where the fdiv carries the afn fast-math flag (no !fpmath).
; The GFX8_9_10 checks expect the division to become an f16 reciprocal of the
; divisor followed by an f16 multiply with the dividend.

; GCN-LABEL: {{^}}v_fdiv_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
  ; Index all three buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  ; Volatile loads; the checks above capture the %a load first (LHS) and the
  ; %b load second (RHS).
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv afn half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; %a / %b with no flags on the fdiv itself; the kernel instead uses attribute
; group #2, which this file defines with "unsafe-fp-math"="true".
; The GFX8_9_10 checks expect the same lowering as the afn variant: an f16
; reciprocal of the divisor followed by an f16 multiply with the dividend.

; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]

; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]

; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
entry:
  ; Index all three buffers by the workitem id.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
  ; Volatile loads; the checks above capture the %a load first (LHS) and the
  ; %b load second (RHS).
  %a.val = load volatile half, half addrspace(1)* %gep.a
  %b.val = load volatile half, half addrspace(1)* %gep.b
  %r.val = fdiv half %a.val, %b.val
  store half %r.val, half addrspace(1)* %gep.r
  ret void
}

; %x / 2.0 with the afn flag.
; Both check prefixes expect the division to become a multiply by 0.5: an f32
; multiply on SI and an f16 multiply on GFX8_9_10, whose result is stored.
; The label uses the shared GCN prefix so that the GFX8_9_10 checks are
; scoped to this function as well.

; GCN-LABEL: {{^}}div_afn_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 {
  ; The dividend is loaded from an undef pointer; only the arithmetic matters.
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv afn half %x, 2.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

; %x / 10.0 with the afn flag.
; Both check prefixes expect the division to become a multiply by a literal
; constant: 0x3dcccccd in an f32 multiply on SI, and 0x2e66 in an f16
; multiply on GFX8_9_10, whose result is stored.
; The label uses the shared GCN prefix so that the GFX8_9_10 checks are
; scoped to this function as well.

; GCN-LABEL: {{^}}div_afn_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 {
  ; The dividend is loaded from an undef pointer; only the arithmetic matters.
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv afn half %x, 10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

; %x / -10.0 with the afn flag.
; Both check prefixes expect the division to become a multiply by a literal
; constant: 0xbdcccccd in an f32 multiply on SI, and 0xae66 in an f16
; multiply on GFX8_9_10, whose result is stored.
; The label uses the shared GCN prefix so that the GFX8_9_10 checks are
; scoped to this function as well.

; GCN-LABEL: {{^}}div_afn_neg_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}

; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
  ; The dividend is loaded from an undef pointer; only the arithmetic matters.
  %x = load half, half addrspace(1)* undef
  %rcp = fdiv afn half %x, -10.0
  store half %rcp, half addrspace(1)* %out, align 4
  ret void
}

; Intrinsics used by the kernels in this file.
; NOTE(review): these declarations reference attribute group #2 (the
; unsafe-fp-math group) while #1 is not referenced by any line visible here;
; confirm whether #1 was intended for the declarations.
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare half @llvm.sqrt.f16(half) #2
declare half @llvm.fabs.f16(half) #2

; #0 is used by every kernel except v_fdiv_f16_unsafe, which uses #2.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }

; Operand of the !fpmath attachments on the rcp/rsq tests: the accuracy the
; fdiv result must meet, in ULPs (2.5).
!0 = !{float 2.500000e+00}