1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
3; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
5
6; Make sure fdiv is promoted to f32.
7
; Plain f16 fdiv with both operands loaded from memory.
; Default-target run: the division is expanded in f32 (two f16->f32 converts,
; the div_scale / rcp / fma / div_fmas / div_fixup sequence, then a convert
; back to f16).
; GFX8/GFX9 runs: both operands are converted to f32, the divisor goes through
; v_rcp_f32, the product is converted back to f16 and finished with
; v_div_fixup_f16.
; The label pattern ends in ':' so it can only match this function and not the
; longer v_fdiv_f16_* names defined later in the file.
8; GCN-LABEL: {{^}}v_fdiv_f16:
9; SI:     v_cvt_f32_f16
10; SI:     v_cvt_f32_f16
11; SI:     v_div_scale_f32
12; SI-DAG: v_div_scale_f32
13; SI-DAG: v_rcp_f32
14; SI:     v_fma_f32
15; SI:     v_fma_f32
16; SI:     v_mul_f32
17; SI:     v_fma_f32
18; SI:     v_fma_f32
19; SI:     v_fma_f32
20; SI:     v_div_fmas_f32
21; SI:     v_div_fixup_f32
22; SI:     v_cvt_f16_f32
23
24; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
25; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
26
27; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
28; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
29
30; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
31; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
32; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
33; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
34; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
35define amdgpu_kernel void @v_fdiv_f16(
36    half addrspace(1)* %r,
37    half addrspace(1)* %a,
38    half addrspace(1)* %b) #0 {
39entry:
40  %tid = call i32 @llvm.amdgcn.workitem.id.x()
41  %tid.ext = sext i32 %tid to i64
42  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
43  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
44  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
45  %a.val = load volatile half, half addrspace(1)* %gep.a
46  %b.val = load volatile half, half addrspace(1)* %gep.b
47  %r.val = fdiv half %a.val, %b.val
48  store half %r.val, half addrspace(1)* %gep.r
49  ret void
50}
51
; 1.0 / x on a loaded f16 value.
; GFX8/GFX9 runs expect a single v_rcp_f16 (e32 encoding) fed directly by the
; loaded register, with no other mention of the input or of the result between
; the load, the reciprocal and the store.
52; GCN-LABEL: {{^}}v_rcp_f16:
53; GFX8_9: {{flat|global}}_load_ushort [[SRC:v[0-9]+]]
54; GFX8_9-NOT: [[SRC]]
55; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[SRC]]
56; GFX8_9-NOT: [[RCP]]
57; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RCP]]
58define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
59entry:
60  %lane = call i32 @llvm.amdgcn.workitem.id.x()
61  %lane.idx = sext i32 %lane to i64
62  %src.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.idx
63  %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.idx
64  %x = load volatile half, half addrspace(1)* %src.ptr
65  %recip = fdiv half 1.0, %x
66  store half %recip, half addrspace(1)* %dst.ptr
67  ret void
68}
69
; 1.0 / fabs(x) on a loaded f16 value.
; GFX8/GFX9 runs expect the fabs to appear as the |...| source modifier on a
; single v_rcp_f16 (e64 encoding). The two -NOT directives require that neither
; the loaded register nor the result register is mentioned between the matched
; instructions.
70; GCN-LABEL: {{^}}v_rcp_f16_abs:
71; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
72; GFX8_9-NOT: [[VAL]]
73; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
74; GFX8_9-NOT: [[RESULT]]
75; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
76define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
77entry:
78  %tid = call i32 @llvm.amdgcn.workitem.id.x()
79  %tid.ext = sext i32 %tid to i64
80  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
81  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
82  %b.val = load volatile half, half addrspace(1)* %gep.b
83  %b.abs = call half @llvm.fabs.f16(half %b.val)
84  %r.val = fdiv half 1.0, %b.abs
85  store half %r.val, half addrspace(1)* %gep.r
86  ret void
87}
88
; 1.0 / x with the arcp fast-math flag on the fdiv.
; GFX8/GFX9 runs expect the same shape as the unflagged reciprocal test: one
; v_rcp_f16 (e32 encoding) between the load and the store, with no other
; mention of the input or result register.
89; GCN-LABEL: {{^}}v_rcp_f16_arcp:
90; GFX8_9: {{flat|global}}_load_ushort [[SRC:v[0-9]+]]
91; GFX8_9-NOT: [[SRC]]
92; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[SRC]]
93; GFX8_9-NOT: [[RCP]]
94; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RCP]]
95define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
96entry:
97  %lane = call i32 @llvm.amdgcn.workitem.id.x()
98  %lane.idx = sext i32 %lane to i64
99  %src.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.idx
100  %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.idx
101  %x = load volatile half, half addrspace(1)* %src.ptr
102  %recip = fdiv arcp half 1.0, %x
103  store half %recip, half addrspace(1)* %dst.ptr
104  ret void
105}
106
; -1.0 / x on a loaded f16 value.
; GFX8/GFX9 runs expect the negation to appear as a '-' source modifier on a
; single v_rcp_f16 (e64 encoding). The two -NOT directives require that neither
; the loaded register nor the result register is mentioned between the matched
; instructions.
107; GCN-LABEL: {{^}}v_rcp_f16_neg:
108; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
109; GFX8_9-NOT: [[VAL]]
110; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
111; GFX8_9-NOT: [[RESULT]]
112; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
113define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
114entry:
115  %tid = call i32 @llvm.amdgcn.workitem.id.x()
116  %tid.ext = sext i32 %tid to i64
117  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
118  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
119  %b.val = load volatile half, half addrspace(1)* %gep.b
120  %r.val = fdiv half -1.0, %b.val
121  store half %r.val, half addrspace(1)* %gep.r
122  ret void
123}
124
; 1.0 / sqrt(x) on a loaded f16 value.
; GFX8/GFX9 runs expect the sqrt and the reciprocal to be combined into a
; single v_rsq_f16 (e32 encoding). The two -NOT directives require that neither
; the loaded register nor the result register is mentioned between the matched
; instructions.
125; GCN-LABEL: {{^}}v_rsq_f16:
126; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
127; GFX8_9-NOT: [[VAL]]
128; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
129; GFX8_9-NOT: [[RESULT]]
130; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
131define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
132entry:
133  %tid = call i32 @llvm.amdgcn.workitem.id.x()
134  %tid.ext = sext i32 %tid to i64
135  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
136  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
137  %b.val = load volatile half, half addrspace(1)* %gep.b
138  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
139  %r.val = fdiv half 1.0, %b.sqrt
140  store half %r.val, half addrspace(1)* %gep.r
141  ret void
142}
143
; -1.0 / sqrt(x) on a loaded f16 value.
; GFX8/GFX9 runs expect this NOT to collapse into a single rsq: a v_sqrt_f16
; must be followed on the very next line by a v_rcp_f16 (e64 encoding) that
; takes the negated sqrt result. The -NOT directives require that the loaded
; register is not mentioned before the sqrt and that the result register is not
; mentioned between the reciprocal and the store.
144; GCN-LABEL: {{^}}v_rsq_f16_neg:
145; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
146; GFX8_9-NOT: [[VAL]]
147; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
148; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
149; GFX8_9-NOT: [[RESULT]]
150; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
151define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
152entry:
153  %tid = call i32 @llvm.amdgcn.workitem.id.x()
154  %tid.ext = sext i32 %tid to i64
155  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
156  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
157  %b.val = load volatile half, half addrspace(1)* %gep.b
158  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
159  %r.val = fdiv half -1.0, %b.sqrt
160  store half %r.val, half addrspace(1)* %gep.r
161  ret void
162}
163
; a / b with the arcp fast-math flag on the fdiv, both operands loaded.
; GFX8/GFX9 runs expect the division to be emitted as v_rcp_f16 of the divisor
; followed by a v_mul_f16 with the dividend, staying entirely in f16.
164; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
165; GFX8_9: {{flat|global}}_load_ushort [[NUM:v[0-9]+]]
166; GFX8_9: {{flat|global}}_load_ushort [[DEN:v[0-9]+]]
167
168; GFX8_9: v_rcp_f16_e32 [[INV:v[0-9]+]], [[DEN]]
169; GFX8_9: v_mul_f16_e32 [[QUOT:v[0-9]+]], [[NUM]], [[INV]]
170
171; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[QUOT]]
172define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
173entry:
174  %lane = call i32 @llvm.amdgcn.workitem.id.x()
175  %lane.idx = sext i32 %lane to i64
176  %num.ptr = getelementptr inbounds half, half addrspace(1)* %a, i64 %lane.idx
177  %den.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.idx
178  %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.idx
179  %num = load volatile half, half addrspace(1)* %num.ptr
180  %den = load volatile half, half addrspace(1)* %den.ptr
181  %quot = fdiv arcp half %num, %den
182  store half %quot, half addrspace(1)* %dst.ptr
183  ret void
184}
185
; a / b with no per-instruction fast-math flags, but in a function carrying
; attribute group #2 ("unsafe-fp-math"="true").
; GFX8/GFX9 runs expect the same rcp-then-multiply f16 sequence as the arcp
; variant of this test.
186; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
187; GFX8_9: {{flat|global}}_load_ushort [[NUM:v[0-9]+]]
188; GFX8_9: {{flat|global}}_load_ushort [[DEN:v[0-9]+]]
189
190; GFX8_9: v_rcp_f16_e32 [[INV:v[0-9]+]], [[DEN]]
191; GFX8_9: v_mul_f16_e32 [[QUOT:v[0-9]+]], [[NUM]], [[INV]]
192
193; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[QUOT]]
194define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
195entry:
196  %lane = call i32 @llvm.amdgcn.workitem.id.x()
197  %lane.idx = sext i32 %lane to i64
198  %num.ptr = getelementptr inbounds half, half addrspace(1)* %a, i64 %lane.idx
199  %den.ptr = getelementptr inbounds half, half addrspace(1)* %b, i64 %lane.idx
200  %dst.ptr = getelementptr inbounds half, half addrspace(1)* %r, i64 %lane.idx
201  %num = load volatile half, half addrspace(1)* %num.ptr
202  %den = load volatile half, half addrspace(1)* %den.ptr
203  %quot = fdiv half %num, %den
204  store half %quot, half addrspace(1)* %dst.ptr
205  ret void
206}
207
; x / 2.0 with the arcp fast-math flag.
; Every run is expected to turn the division into a multiply by the inline
; constant 0.5 (in f32 on the default target, in f16 on GFX8/GFX9).
; The label uses the GCN prefix because that is the only prefix shared by every
; run line of this file; a label under a prefix that no run line enables is
; silently ignored by FileCheck.
208; GCN-LABEL: {{^}}div_arcp_2_x_pat_f16:
209; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
210
211; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
212; GFX8_9: buffer_store_short [[MUL]]
213define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
214  %x = load half, half addrspace(1)* undef
215  %rcp = fdiv arcp half %x, 2.0
216  store half %rcp, half addrspace(1)* %out, align 4
217  ret void
218}
219
; x / 10.0 with the arcp fast-math flag.
; Every run is expected to turn the division into a multiply by a literal
; constant: 0x2e66 as f16 on GFX8/GFX9 and 0x3dccc000 as f32 on the default
; target (both encode roughly 0.1, the reciprocal of the divisor).
; The label uses the GCN prefix because that is the only prefix shared by every
; run line of this file; a label under a prefix that no run line enables is
; silently ignored by FileCheck.
220; GCN-LABEL: {{^}}div_arcp_k_x_pat_f16:
221; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
222
223; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
224; GFX8_9: buffer_store_short [[MUL]]
225define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
226  %x = load half, half addrspace(1)* undef
227  %rcp = fdiv arcp half %x, 10.0
228  store half %rcp, half addrspace(1)* %out, align 4
229  ret void
230}
231
; x / -10.0 with the arcp fast-math flag.
; Every run is expected to turn the division into a multiply by a literal
; constant: 0xae66 as f16 on GFX8/GFX9 and 0xbdccc000 as f32 on the default
; target (the sign-flipped forms of the constants used for x / 10.0).
; The label uses the GCN prefix because that is the only prefix shared by every
; run line of this file; a label under a prefix that no run line enables is
; silently ignored by FileCheck.
232; GCN-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
233; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
234
235; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
236; GFX8_9: buffer_store_short [[MUL]]
237define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
238  %x = load half, half addrspace(1)* undef
239  %rcp = fdiv arcp half %x, -10.0
240  store half %rcp, half addrspace(1)* %out, align 4
241  ret void
242}
243
; Intrinsics used by the kernels in this file: the workitem id supplies the
; per-lane element index; sqrt and fabs feed the rsq and |x| reciprocal tests.
244declare i32 @llvm.amdgcn.workitem.id.x() #1
245declare half @llvm.sqrt.f16(half) #1
246declare half @llvm.fabs.f16(half) #1
247
; Attribute groups:
;   #0 - applied to every kernel except v_fdiv_f16_unsafe.
;   #1 - applied to the intrinsic declarations above.
;   #2 - applied only to v_fdiv_f16_unsafe; enables unsafe FP math for the
;        whole function instead of using a per-instruction fast-math flag.
248attributes #0 = { nounwind }
249attributes #1 = { nounwind readnone }
250attributes #2 = { nounwind "unsafe-fp-math"="true" }
251