1.section #gf100_builtin_code
// Everything up to the next .section directive (the four gf100_* routines)
// is assembled into this section.
2// DIV U32
3//
4// UNR recurrence (q = a / b):
5// look for z such that 2^32 - b <= b * z < 2^32
6// then q - 1 <= (a * z) / 2^32 <= q
7//
8// INPUT:   $r0: dividend, $r1: divisor
9// OUTPUT:  $r0: result, $r1: modulus
10// CLOBBER: $r2 - $r3, $p0 - $p1
11// SIZE:    22 / 14 * 8 bytes
12//
13gf100_div_u32:
   // Unsigned 32-bit divide: a in $r0, b in $r1 -> quotient in $r0,
   // remainder in $r1. Strategy: build z ~= 2^32 / b, take q = (a * z) >> 32,
   // then fix q up with at most two conditional corrections.
   //
   // --- Initial estimate: $r2 = 1 << (bfind(b) ^ 31) ---
   // NOTE(review): presumably bfind yields the index of the highest set bit,
   // making (index ^ 0x1f) the leading-zero count and z0 a power of two not
   // exceeding 2^32 / b -- confirm against the ISA documentation.
14   bfind u32 $r2 $r1
15   xor b32 $r2 $r2 0x1f
16   mov b32 $r3 0x1
17   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, kept negated for the rest of the refinement.
18   cvt u32 $r1 neg u32 $r1
   // --- Refinement, unrolled five times ---
   // Each step: $r3 = low32(-b * z), i.e. the shortfall 2^32 - b*z (mod 2^32);
   //            z  += high32(z * $r3).
19   mul $r3 u32 $r1 u32 $r2
20   add $r2 (mul high u32 $r2 u32 $r3) $r2
21   mul $r3 u32 $r1 u32 $r2
22   add $r2 (mul high u32 $r2 u32 $r3) $r2
23   mul $r3 u32 $r1 u32 $r2
24   add $r2 (mul high u32 $r2 u32 $r3) $r2
25   mul $r3 u32 $r1 u32 $r2
26   add $r2 (mul high u32 $r2 u32 $r3) $r2
27   mul $r3 u32 $r1 u32 $r2
28   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // --- Quotient / remainder estimate ---
   // Save the dividend in $r3 because $r0 is overwritten with the quotient.
29   mov b32 $r3 $r0
   // $r0 = high32(a * z): quotient estimate.
30   mul high $r0 u32 $r0 u32 $r2
   // $r2 = -(-b) = b: recover the original divisor for the comparisons below.
31   cvt u32 $r2 neg u32 $r1
   // $r1 = (-b) * q + a: remainder for the estimated quotient.
32   add $r1 (mul u32 $r1 u32 $r0) $r3
   // --- Correction: while remainder >= b (unsigned), subtract b and bump q ---
33   set $p0 0x1 ge u32 $r1 $r2
34   $p0 sub b32 $r1 $r1 $r2
35   $p0 add b32 $r0 $r0 0x1
   // The second correction is only evaluated when the first one fired,
   // since the re-test itself is predicated on $p0.
36   $p0 set $p0 0x1 ge u32 $r1 $r2
37   $p0 sub b32 $r1 $r1 $r2
38   $p0 add b32 $r0 $r0 0x1
   // NOTE(review): only $p0 is written in this routine; the header also lists
   // $p1 as clobbered -- confirm whether that is intentional slack.
39   ret
40
41// DIV S32, like DIV U32 after taking ABS(inputs)
42//
43// INPUT:   $r0: dividend, $r1: divisor
44// OUTPUT:  $r0: result, $r1: modulus
45// CLOBBER: $r2 - $r3, $p0 - $p3
46//
47gf100_div_s32:
   // Signed 32-bit divide: record the operand signs, run the same unsigned
   // sequence as gf100_div_u32 on the absolute values, then re-apply signs.
   //
   // $p2 = (dividend < 0): later used to negate the remainder.
48   set $p2 0x1 lt s32 $r0 0x0
   // $p3 = (divisor < 0) xor $p2: set when the operand signs differ; later
   // used to negate the quotient.
49   set $p3 0x1 lt s32 $r1 0x0 xor $p2
   // Replace both operands with their absolute values.
50   cvt s32 $r0 abs s32 $r0
51   cvt s32 $r1 abs s32 $r1
   // --- Initial estimate: $r2 = 1 << (bfind(|b|) ^ 31) ---
   // NOTE(review): presumably bfind yields the index of the highest set bit
   // -- confirm against the ISA documentation.
52   bfind u32 $r2 $r1
53   xor b32 $r2 $r2 0x1f
54   mov b32 $r3 0x1
55   shl b32 $r2 $r3 clamp $r2
   // $r1 = -|b|, kept negated for the refinement.
56   cvt u32 $r1 neg u32 $r1
   // --- Refinement, unrolled five times ---
   // Each step: $r3 = low32(-|b| * z); z += high32(z * $r3).
57   mul $r3 u32 $r1 u32 $r2
58   add $r2 (mul high u32 $r2 u32 $r3) $r2
59   mul $r3 u32 $r1 u32 $r2
60   add $r2 (mul high u32 $r2 u32 $r3) $r2
61   mul $r3 u32 $r1 u32 $r2
62   add $r2 (mul high u32 $r2 u32 $r3) $r2
63   mul $r3 u32 $r1 u32 $r2
64   add $r2 (mul high u32 $r2 u32 $r3) $r2
65   mul $r3 u32 $r1 u32 $r2
66   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // --- Quotient / remainder estimate on the absolute values ---
   // Save |a| in $r3 because $r0 is overwritten with the quotient.
67   mov b32 $r3 $r0
   // $r0 = high32(|a| * z): quotient estimate.
68   mul high $r0 u32 $r0 u32 $r2
   // $r2 = |b| again, for the comparisons below.
69   cvt u32 $r2 neg u32 $r1
   // $r1 = (-|b|) * q + |a|: remainder for the estimated quotient.
70   add $r1 (mul u32 $r1 u32 $r0) $r3
   // --- Correction: up to two rounds of "remainder >= |b| -> subtract, q++" ---
71   set $p0 0x1 ge u32 $r1 $r2
72   $p0 sub b32 $r1 $r1 $r2
73   $p0 add b32 $r0 $r0 0x1
   // Second round is predicated on the first having fired.
74   $p0 set $p0 0x1 ge u32 $r1 $r2
75   $p0 sub b32 $r1 $r1 $r2
76   $p0 add b32 $r0 $r0 0x1
   // --- Re-apply signs ---
   // Quotient is negated when the operand signs differed ($p3).
77   $p3 cvt s32 $r0 neg s32 $r0
   // Remainder takes the sign of the dividend ($p2).
78   $p2 cvt s32 $r1 neg s32 $r1
   // NOTE(review): $p1 is listed as clobbered in the header but is not written
   // by the visible code -- confirm.
79   ret
80
81// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
82//
83// INPUT:   $r0d (x)
84// OUTPUT:  $r0d (rcp(x))
85// CLOBBER: $r2 - $r7
86// SIZE:    9 * 8 bytes
87//
88gf100_rcp_f64:
   // Placeholder body: a single nop followed by ret. No Newton-Raphson
   // iteration is performed here and no register is written, so the header's
   // CLOBBER and SIZE figures do not describe this body.
   // NOTE(review): callers presumably get $r0d back unchanged -- confirm
   // whether a real implementation is expected to replace this stub.
89   nop
90   ret
91
92// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
93//
94// INPUT:   $r0d (x)
95// OUTPUT:  $r0d (rsqrt(x))
96// CLOBBER: $r2 - $r7
97// SIZE:    14 * 8 bytes
98//
99gf100_rsq_f64:
   // Placeholder body: a single nop followed by ret. No rsqrt refinement is
   // performed here and no register is written, so the header's CLOBBER and
   // SIZE figures do not describe this body.
   // NOTE(review): callers presumably get $r0d back unchanged -- confirm
   // whether a real implementation is expected to replace this stub.
100   nop
101   ret
102
103.section #gf100_builtin_offsets
// One .b64 entry per built-in routine, listed in the same order the routines
// are defined in the code section: div_u32, div_s32, rcp_f64, rsq_f64.
// NOTE(review): presumably the consumer indexes this table by position to
// locate each routine, so the order must stay in sync with it -- confirm.
104.b64 #gf100_div_u32
105.b64 #gf100_div_s32
106.b64 #gf100_rcp_f64
107.b64 #gf100_rsq_f64
108