1.section #gk110_builtin_code
2// DIV U32
3//
4// UNR recurrence (q = a / b):
5// look for z such that 2^32 - b <= b * z < 2^32
6// then q - 1 <= (a * z) / 2^32 <= q
7//
8// INPUT:   $r0: dividend, $r1: divisor
9// OUTPUT:  $r0: result, $r1: modulus
10// CLOBBER: $r2 - $r3, $p0 - $p1
11// SIZE:    22 / 14 * 8 bytes
12//
13gk110_div_u32:
// Unsigned 32-bit divide: $r0 = $r0 / $r1, $r1 = $r0 % $r1.
//
// Outline:
//   1. Build a power-of-two starting estimate z in $r2 from the divisor.
//   2. Negate the divisor and run five refinement rounds of
//        e = (-b) * z          (low 32 bits, i.e. 2^32 - b*z mod 2^32)
//        z = z + hi32(z * e)
//   3. q = hi32(a * z), r = a - b*q, then up to two predicated
//      "r -= b, q += 1" corrections while r >= b (unsigned).
//
// Registers written here: $r0 - $r3 and predicate $p0.
// NOTE(review): no instruction in this routine writes $p1, and the routine
// is 26 instructions plus 4 sched lines; the header's CLOBBER / SIZE figures
// look inherited from another variant -- confirm before relying on them.
//
// NOTE(review): every sched line in this file is followed by exactly seven
// instructions, so each sched presumably carries the scheduling codes for
// the seven instructions after it. Do not insert, delete or reorder
// instructions without re-deriving the sched lines.
14   sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28
15   bfind u32 $r2 $r1 // presumably: index of the highest set bit of the divisor -- confirm against ISA docs
16   xor b32 $r2 $r2 0x1f // flip the low 5 bits: maps n in 0..31 to 31 - n
17   mov b32 $r3 0x1
18   shl b32 $r2 $r3 clamp $r2 // $r2 = 1 << $r2: power-of-two initial estimate z
19   cvt u32 $r1 neg u32 $r1 // $r1 = -b, kept negated until the remainder is formed
20   mul $r3 u32 $r1 u32 $r2 // round 1: e = (-b) * z (low word)
21   add $r2 (mul high u32 $r2 u32 $r3) $r2 // round 1: z += hi32(z * e)
22   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
23   mul $r3 u32 $r1 u32 $r2 // round 2
24   add $r2 (mul high u32 $r2 u32 $r3) $r2
25   mul $r3 u32 $r1 u32 $r2 // round 3
26   add $r2 (mul high u32 $r2 u32 $r3) $r2
27   mul $r3 u32 $r1 u32 $r2 // round 4
28   add $r2 (mul high u32 $r2 u32 $r3) $r2
29   mul $r3 u32 $r1 u32 $r2 // round 5 (its add follows the next sched line)
30   sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
31   add $r2 (mul high u32 $r2 u32 $r3) $r2
32   mov b32 $r3 $r0 // keep the dividend a; $r0 is about to become the quotient
33   mul high $r0 u32 $r0 u32 $r2 // q = hi32(a * z)
34   cvt u32 $r2 neg u32 $r1 // $r2 = -(-b) = b, needed for the compares below
35   add $r1 (mul u32 $r1 u32 $r0) $r3 // r = (-b) * q + a
36   set $p0 0x1 ge u32 $r1 $r2 // $p0 = (r >= b), unsigned
37   $p0 sub b32 $r1 $r1 $r2 // first correction: r -= b
38   sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20
39   $p0 add b32 $r0 $r0 0x1 // first correction: q += 1
40   $p0 set $p0 0x1 ge u32 $r1 $r2 // re-test only if the first correction ran
41   $p0 sub b32 $r1 $r1 $r2 // second correction: r -= b
42   $p0 add b32 $r0 $r0 0x1 // second correction: q += 1
43   ret // only five instructions follow the sched above; the next two in file order open gk110_div_s32
44
45// DIV S32, like DIV U32 after taking ABS(inputs)
46//
47// INPUT:   $r0: dividend, $r1: divisor
48// OUTPUT:  $r0: result, $r1: modulus
49// CLOBBER: $r2 - $r3, $p0 - $p3
50//
51gk110_div_s32:
// Signed 32-bit divide: record the operand signs in predicates, divide the
// absolute values with the same instruction sequence as gk110_div_u32, then
// re-apply the signs. The quotient is negated when the operand signs differ
// and the remainder is negated when the dividend was negative.
//
// Registers written here: $r0 - $r3 and predicates $p0, $p2, $p3.
// NOTE(review): no instruction in this routine writes $p1 although the
// header lists $p0 - $p3 as clobbered -- confirm before relying on it.
//
// NOTE(review): the two set instructions below come before this routine's
// first sched line; in file order they are the sixth and seventh instruction
// after the last sched of gk110_div_u32. Every sched line is followed by
// exactly seven instructions, so instruction counts here and in the routine
// above must not change without re-deriving the sched lines.
52   set $p2 0x1 lt s32 $r0 0x0 // $p2 = dividend is negative
53   set $p3 0x1 lt s32 $r1 0x0 xor $p2 // $p3 = (divisor is negative) xor $p2: operand signs differ
54   sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28
55   cvt s32 $r0 abs s32 $r0 // a = |dividend|
56   cvt s32 $r1 abs s32 $r1 // b = |divisor|
57   bfind u32 $r2 $r1 // presumably: index of the highest set bit of b -- confirm against ISA docs
58   xor b32 $r2 $r2 0x1f // flip the low 5 bits: maps n in 0..31 to 31 - n
59   mov b32 $r3 0x1
60   shl b32 $r2 $r3 clamp $r2 // $r2 = 1 << $r2: power-of-two initial estimate z
61   cvt u32 $r1 neg u32 $r1 // $r1 = -b, kept negated until the remainder is formed
62   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
63   mul $r3 u32 $r1 u32 $r2 // round 1: e = (-b) * z (low word)
64   add $r2 (mul high u32 $r2 u32 $r3) $r2 // round 1: z += hi32(z * e)
65   mul $r3 u32 $r1 u32 $r2 // round 2
66   add $r2 (mul high u32 $r2 u32 $r3) $r2
67   mul $r3 u32 $r1 u32 $r2 // round 3
68   add $r2 (mul high u32 $r2 u32 $r3) $r2
69   mul $r3 u32 $r1 u32 $r2 // round 4 (its add follows the next sched line)
70   sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
71   add $r2 (mul high u32 $r2 u32 $r3) $r2
72   mul $r3 u32 $r1 u32 $r2 // round 5
73   add $r2 (mul high u32 $r2 u32 $r3) $r2
74   mov b32 $r3 $r0 // keep a; $r0 is about to become the quotient
75   mul high $r0 u32 $r0 u32 $r2 // q = hi32(a * z)
76   cvt u32 $r2 neg u32 $r1 // $r2 = -(-b) = b, needed for the compares below
77   add $r1 (mul u32 $r1 u32 $r0) $r3 // r = (-b) * q + a
78   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
79   set $p0 0x1 ge u32 $r1 $r2 // $p0 = (r >= b), unsigned
80   $p0 sub b32 $r1 $r1 $r2 // first correction: r -= b
81   $p0 add b32 $r0 $r0 0x1 // first correction: q += 1
82   $p0 set $p0 0x1 ge u32 $r1 $r2 // re-test only if the first correction ran
83   $p0 sub b32 $r1 $r1 $r2 // second correction: r -= b
84   $p0 add b32 $r0 $r0 0x1 // second correction: q += 1
85   $p3 cvt s32 $r0 neg s32 $r0 // operand signs differed: negate the quotient
86   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
87   $p2 cvt s32 $r1 neg s32 $r1 // dividend was negative: negate the remainder
88   ret
89
// Both labels below mark the same address, and the only instruction there is
// ret: a call to either entry point returns at once without writing any
// register. Presumably placeholders so that the offsets table has an entry
// for each builtin -- confirm with the code that consumes the table.
90gk110_rcp_f64:
91gk110_rsq_f64:
92   ret
93
// Table of 64-bit entries, one per builtin entry label, in the order
// div_u32, div_s32, rcp_f64, rsq_f64. The last two labels mark the same
// instruction, so those two entries refer to the same location.
// NOTE(review): consumers presumably select an entry by position, so keep
// this order in sync with whatever indexes the table -- confirm with caller.
94.section #gk110_builtin_offsets
95.b64 #gk110_div_u32
96.b64 #gk110_div_s32
97.b64 #gk110_rcp_f64
98.b64 #gk110_rsq_f64
99