; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s

; These tests check that fdiv is expanded correctly and also test that the
; scheduler is scheduling the RECIP_IEEE and MUL_IEEE instructions in separate
; instruction groups.

; These tests check fdiv using unsafe_fp_math, coarse fp div, and IEEE754 fp div.

; Plain f32 fdiv with fp32 denormals disabled (attribute group #0). The GCN
; checks expect the v_fma_f32 refinement sequence to sit between two
; s_setreg_imm32_b32 writes to HW_REG_MODE.
; FUNC-LABEL: {{^}}fdiv_f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Same expansion with fp32 denormals enabled (attribute group #2). The GCN
; checks expect no s_setreg before or after the v_fma_f32 sequence.
; FUNC-LABEL: {{^}}fdiv_f32_denormals:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_div_scale_f32 [[NUM_SCALE:v[0-9]+]]
; GCN-DAG: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GCN-DAG: v_rcp_f32_e32 [[NUM_RCP:v[0-9]+]], [[NUM_SCALE]]

; GCN-NOT: s_setreg
; GCN: v_fma_f32 [[A:v[0-9]+]], -[[NUM_SCALE]], [[NUM_RCP]], 1.0
; GCN: v_fma_f32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]], [[NUM_RCP]]
; GCN: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GCN: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GCN: v_fma_f32 [[E:v[0-9]+]], [[D]], [[B]], [[C]]
; GCN: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GCN-NOT: s_setreg
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; fdiv carrying !fpmath !0 (2.5) with fp32 denormals disabled (#0). The GCN
; checks expect a v_cndmask_b32 / v_mul_f32 / v_rcp_f32 based sequence.
; FUNC-LABEL: {{^}}fdiv_25ulp_f32:
; GCN: v_cndmask_b32
; GCN: v_mul_f32
; GCN: v_rcp_f32
; GCN: v_mul_f32
; GCN: v_mul_f32
define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Use correct fdiv: with fp32 denormals enabled (#2) the same !fpmath !0
; division is expected to go through v_fma_f32, v_div_fmas_f32 and
; v_div_fixup_f32.
; FUNC-LABEL: {{^}}fdiv_25ulp_denormals_f32:
; GCN: v_fma_f32
; GCN: v_div_fmas_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; `fdiv fast` with fp32 denormals enabled (#2). The GCN checks expect a single
; v_rcp_f32_e32 feeding a v_mul_f32_e32 whose result is stored directly, with
; no s_setreg.
; FUNC-LABEL: {{^}}fdiv_fast_denormals_f32:
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN-NOT: s_setreg
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; `fdiv fast` with fp32 denormals disabled (#0): rcp followed by mul, result
; stored without further modification.
; FUNC-LABEL: {{^}}fdiv_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; `fdiv fast` combined with !fpmath !0: the same rcp + mul sequence is expected.
; FUNC-LABEL: {{^}}fdiv_ulp25_f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_ulp25_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv fast float %a, %b, !fpmath !0
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; `fdiv arcp` (allow-reciprocal only): the same rcp + mul sequence is expected.
; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, PS

; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
entry:
  %fdiv = fdiv arcp float %a, %b
  store float %fdiv, float addrspace(1)* %out
  ret void
}

; Plain <2 x float> fdiv: the GCN checks expect four v_div_scale_f32
; instructions.
; FUNC-LABEL: {{^}}fdiv_v2f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
; GCN: v_div_scale_f32
define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> `fdiv arcp` with !fpmath !0: two v_rcp_f32 are expected and no
; v_cmp_gt_f32 may follow them.
; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN-NOT: v_cmp_gt_f32
define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> `fdiv fast`: one v_rcp_f32 per element is expected.
; FUNC-LABEL: {{^}}fdiv_v2f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv fast <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; <2 x float> `fdiv arcp`: one v_rcp_f32 per element is expected.
; FUNC-LABEL: {{^}}fdiv_v2f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Z
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[3].Y
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
entry:
  %fdiv = fdiv arcp <2 x float> %a, %b
  store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
  ret void
}

; Plain <4 x float> fdiv on operands loaded from %in and %in + 1: the GCN
; checks expect four v_div_fixup_f32 instructions.
; FUNC-LABEL: {{^}}fdiv_v4f32:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
; GCN: v_div_fixup_f32
define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x float> `fdiv fast`: one v_rcp_f32 per element is expected.
; FUNC-LABEL: {{^}}fdiv_v4f32_fast_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv fast <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x float> `fdiv arcp`: one v_rcp_f32 per element is expected.
; FUNC-LABEL: {{^}}fdiv_v4f32_arcp_math:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, PS

; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
; GCN: v_rcp_f32
define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
  %a = load <4 x float>, <4 x float> addrspace(1) * %in
  %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
  %result = fdiv arcp <4 x float> %a, %b
  store <4 x float> %result, <4 x float> addrspace(1)* %out
  ret void
}

; Attribute groups:
;   #0 - unsafe-fp-math off, fp32 denormals disabled
;   #1 - unsafe-fp-math on,  fp32 denormals disabled
;   #2 - unsafe-fp-math off, fp32 denormals enabled
; NOTE(review): no function shown in this file references #1 - confirm whether
; a test using it was intended.
attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }

; !fpmath operand used by the "25ulp" / "ulp25" tests above.
!0 = !{float 2.500000e+00}