.section #gk110_builtin_code
// Built-in integer division helpers for GK110, followed by a table of their
// entry points (section #gk110_builtin_offsets at the end of this file).
//
// Layout note: every "sched" line below carries seven values and is followed
// by seven instructions; the groups of seven run straight across label
// boundaries (the last group of gk110_div_u32 finishes inside gk110_div_s32,
// and the last group of gk110_div_s32 finishes at the shared stub "ret").
// Presumably each value is the scheduling hint for one following instruction
// (TODO confirm against the ISA documentation), so adding, removing or
// reordering instructions requires redoing the sched lines.

// DIV U32
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    22 / 14 * 8 bytes
//
// NOTE(review): only $p0 is written by the code below; the CLOBBER and SIZE
// fields above are kept as found and have not been re-verified.
//
gk110_div_u32:
   sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28
   // Initial estimate: $r2 = 1 << (bfind($r1) ^ 0x1f).
   // Presumably bfind yields the index of the divisor's highest set bit, so
   // the xor with 0x1f turns it into (31 - index) -- TODO confirm.
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b; kept negated until the remainder has been computed.
   cvt u32 $r1 neg u32 $r1
   // Refinement step (performed five times in total):
   //   $r3 = low32(-b * z)
   //   z   = z + high32(z * $r3)
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save the dividend, then q = high32(a * z).
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again (negation of -b); $r1 = (-b) * q + a, i.e. the remainder
   // candidate a - b * q.
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: if remainder >= b, subtract b and bump the quotient.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20
   $p0 add b32 $r0 $r0 0x1
   // Second correction: the compare is itself predicated on $p0, so it is
   // only evaluated when the first correction was applied.
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   ret

// DIV S32, like DIV U32 after taking ABS(inputs)
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
//
gk110_div_s32:
   // $p2 = dividend < 0                    -> negate the modulus at the end.
   // $p3 = (divisor < 0) xor $p2           -> negate the quotient at the end.
   set $p2 0x1 lt s32 $r0 0x0
   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28
   // Work on absolute values from here on.
   cvt s32 $r0 abs s32 $r0
   cvt s32 $r1 abs s32 $r1
   // Same initial estimate as gk110_div_u32: $r2 = 1 << (bfind($r1) ^ 0x1f).
   bfind u32 $r2 $r1
   xor b32 $r2 $r2 0x1f
   mov b32 $r3 0x1
   shl b32 $r2 $r3 clamp $r2
   // $r1 = -|b|
   cvt u32 $r1 neg u32 $r1
   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   // Five refinement steps, identical to gk110_div_u32:
   //   $r3 = low32(-b * z);  z = z + high32(z * $r3)
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   mul $r3 u32 $r1 u32 $r2
   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Save |a|, then q = high32(|a| * z).
   mov b32 $r3 $r0
   mul high $r0 u32 $r0 u32 $r2
   // $r2 = |b|; $r1 = (-|b|) * q + |a|, the remainder candidate.
   cvt u32 $r2 neg u32 $r1
   add $r1 (mul u32 $r1 u32 $r0) $r3
   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
   // Up to two corrections, the second only evaluated if the first fired.
   set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   $p0 set $p0 0x1 ge u32 $r1 $r2
   $p0 sub b32 $r1 $r1 $r2
   $p0 add b32 $r0 $r0 0x1
   // Re-apply the signs recorded in $p3 (quotient) and $p2 (modulus).
   $p3 cvt s32 $r0 neg s32 $r0
   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
   $p2 cvt s32 $r1 neg s32 $r1
   ret

// Both f64 entry points share a single bare "ret": no computation is
// performed here for either of them.
gk110_rcp_f64:
gk110_rsq_f64:
   ret

// One 64-bit entry per builtin, in this order: div_u32, div_s32, rcp_f64,
// rsq_f64. Presumably consumers index this table by position (TODO confirm),
// so the order should be treated as part of the interface.
.section #gk110_builtin_offsets
.b64 #gk110_div_u32
.b64 #gk110_div_s32
.b64 #gk110_rcp_f64
.b64 #gk110_rsq_f64