.section #gf100_builtin_code
// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
// NOTE(review): only $p0 is written by the code below; $p1 is listed as
// clobbered but is not touched here.
//
gf100_div_u32:
   // Initial estimate z0 in $r2: a single bit derived from the divisor.
   // bfind presumably yields the index of the highest set bit of $r1
   // (confirm against the ISA docs); xor with 0x1f maps an index n in
   // 0..31 to 31 - n, and $r2 = 1 << that amount (shift uses `clamp`).
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, kept negated for the whole refinement loop.
   cvt u32 $r1 neg u32 $r1
   // Five unrolled refinement steps:
   //   $r3 = low32((-b) * z)        (i.e. 2^32 - b * z modulo 2^32)
   //   z  += high32(z * $r3)
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save the dividend a in $r3, then q = high32(a * z) in $r0.
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again (negating the negated divisor), $r1 = a + (-b) * q,
   // i.e. the remainder for the current quotient estimate.
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // Up to two correction steps: while remainder >= b, subtract b and
   // bump the quotient. The second compare is itself predicated on $p0,
   // so it only runs if the first correction was taken.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   ret

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
// NOTE(review): $p1 is listed as clobbered but is not written by the code
// below ($p0, $p2 and $p3 are).
//
gf100_div_s32:
   // $p2 = dividend < 0 (sign applied to the modulus at the end).
   // $p3 = (divisor < 0) xor $p2, i.e. operands have different signs
   //       (sign applied to the quotient at the end).
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   // Work on absolute values from here on.
   cvt s32 $r0 abs s32 $r0
   cvt s32 $r1 abs s32 $r1
   // Same sequence as gf100_div_u32: initial estimate z0 in $r2.
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -|b|
   cvt u32 $r1 neg u32 $r1
   // Five unrolled refinement steps: z += high32(z * low32((-|b|) * z)).
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save |a| in $r3, q = high32(|a| * z) in $r0.
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = |b|, $r1 = |a| + (-|b|) * q (remainder for this estimate).
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // Up to two correction steps, the second only if the first was taken.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Re-apply signs: quotient is negated when operand signs differed,
   // modulus is negated when the dividend was negative.
   $p3 cvt s32 $r0 neg s32 $r0
   $p2 cvt s32 $r1 neg s32 $r1
   ret

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE:    9 * 8 bytes
//
// NOTE(review): the body below is only a nop/ret stub; the algorithm, SIZE
// and CLOBBER figures in this header do not describe the visible code.
//
gf100_rcp_f64:
   nop
   ret

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
// NOTE(review): the body below is only a nop/ret stub; the algorithm, SIZE
// and CLOBBER figures in this header do not describe the visible code.
//
gf100_rsq_f64:
   nop
   ret

// Table of 64-bit entries, one per builtin label, in the order
// div_u32, div_s32, rcp_f64, rsq_f64.
.section #gf100_builtin_offsets
.b64 #gf100_div_u32
.b64 #gf100_div_s32
.b64 #gf100_rcp_f64
.b64 #gf100_rsq_f64