; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s

; These tests check that fdiv is expanded correctly and also that the
; scheduler places the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.

; These tests cover fdiv with unsafe_fp_math, coarse (!fpmath-relaxed) fp
; division, and full IEEE-754 fp division.

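; On GCN, the full-precision expansion checked below scales the operands with
; v_div_scale_f32, forms an initial estimate with v_rcp_f32, refines it with a
; chain of v_fma_f32 steps, and finishes with v_div_fmas_f32 and
; v_div_fixup_f32. When fp32 denormals are disabled, s_setreg toggles the
; denormal mode around the refinement sequence.
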
; FUNC-LABEL: {{^}}fdiv_f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

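; With fp32 denormals enabled (attribute #2), the denormal mode does not need
; to be toggled, so no s_setreg is expected.
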
; FUNC-LABEL: {{^}}fdiv_f32_denormals:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN-NOT: s_setreg
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

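; The 2.5 ulp !fpmath hint allows the cheaper rcp-based expansion when fp32
; denormals are disabled.
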
; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
; GCN: v_cndmask_b32
; GCN: v_mul_f32
; GCN: v_rcp_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Denormals are enabled, so the correct (full) fdiv expansion is used despite
; the 2.5 ulp !fpmath hint.
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
; GCN: v_fma_f32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

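; The fast flag allows the rcp + mul expansion even with fp32 denormals
; enabled, and no denormal mode switch is expected.
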
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN-NOT: s_setreg
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

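; The arcp flag alone is also enough to select the rcp + mul expansion.
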
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv arcp float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

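; Each element of the <2 x float> division gets its own pair of
; v_div_scale_f32 instructions, four in total.
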
; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

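; With arcp and the 2.5 ulp hint, both elements use the plain rcp expansion
; and no compare is expected.
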
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv fast <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

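; The <4 x float> tests load both operands from global memory instead of
; taking them as kernel arguments.
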
; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv fast <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv arcp <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }

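; 2.5 ulp accuracy bound used by the !fpmath tests above.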
!0 = !{float 2.500000e+00}