1.section #gk104_builtin_code
2// DIV U32
3//
4// UNR recurrence (q = a / b):
5// look for z such that 2^32 - b <= b * z < 2^32
6// then q - 1 <= (a * z) / 2^32 <= q
7//
8// INPUT:   $r0: dividend, $r1: divisor
9// OUTPUT:  $r0: result, $r1: modulus
10// CLOBBER: $r2 - $r3, $p0 - $p1
11// SIZE:    22 / 14 * 8 bytes
12//
13gk104_div_u32:
   // Unsigned 32-bit division. On return $r0 holds the quotient and $r1 the
   // remainder. $r2 holds the reciprocal estimate z, $r3 is scratch.
   // NOTE(review): a sched line precedes every group of seven instructions
   // (the grouping continues across routine boundaries); presumably these are
   // scheduling control words, so keep instruction count and order intact.
14   sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
   // Initial estimate: z = 1 << (bfind(b) ^ 31)
15   bfind u32 $r2 $r1
16   long xor b32 $r2 $r2 0x1f
17   long mov b32 $r3 0x1
18   shl b32 $r2 $r3 clamp $r2
   // $r1 = -b, used by every refinement step below
19   long cvt u32 $r1 neg u32 $r1
   // Refinement step (performed five times): z += mulhi(z, -b * z)
20   long mul $r3 u32 $r1 u32 $r2
21   add $r2 (mul high u32 $r2 u32 $r3) $r2
22   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
23   mul $r3 u32 $r1 u32 $r2
24   add $r2 (mul high u32 $r2 u32 $r3) $r2
25   mul $r3 u32 $r1 u32 $r2
26   add $r2 (mul high u32 $r2 u32 $r3) $r2
27   mul $r3 u32 $r1 u32 $r2
28   add $r2 (mul high u32 $r2 u32 $r3) $r2
29   mul $r3 u32 $r1 u32 $r2
30   sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
31   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Keep the dividend in $r3, then q = mulhi(a, z)
32   mov b32 $r3 $r0
33   mul high $r0 u32 $r0 u32 $r2
   // $r2 = b again ($r1 still holds -b); remainder = a + (-b) * q
34   long cvt u32 $r2 neg u32 $r1
35   long add $r1 (mul u32 $r1 u32 $r0) $r3
   // First correction: while remainder >= b, subtract b and bump the quotient
36   set $p0 0x1 ge u32 $r1 $r2
37   $p0 sub b32 $r1 $r1 $r2
38   sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
39   $p0 add b32 $r0 $r0 0x1
   // Second correction, only evaluated if the first one was applied
40   $p0 set $p0 0x1 ge u32 $r1 $r2
41   $p0 sub b32 $r1 $r1 $r2
42   $p0 add b32 $r0 $r0 0x1
43   long ret
44
45// DIV S32, like DIV U32 after taking ABS(inputs)
46//
47// INPUT:   $r0: dividend, $r1: divisor
48// OUTPUT:  $r0: result, $r1: modulus
49// CLOBBER: $r2 - $r3, $p0 - $p3
50//
51gk104_div_s32:
   // Signed 32-bit division: same reciprocal scheme as the unsigned routine,
   // run on the absolute values, with the signs re-applied at the end.
   // $p2 = dividend is negative; $p3 = (divisor is negative) xor $p2, i.e.
   // the quotient must be negated.
52   set $p2 0x1 lt s32 $r0 0x0
53   set $p3 0x1 lt s32 $r1 0x0 xor $p2
54   sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
55   long cvt s32 $r0 abs s32 $r0
56   long cvt s32 $r1 abs s32 $r1
   // Initial estimate: z = 1 << (bfind(|b|) ^ 31)
57   bfind u32 $r2 $r1
58   long xor b32 $r2 $r2 0x1f
59   long mov b32 $r3 0x1
60   shl b32 $r2 $r3 clamp $r2
   // $r1 = -|b|
61   cvt u32 $r1 neg u32 $r1
62   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
   // Five refinement steps: z += mulhi(z, -|b| * z)
63   mul $r3 u32 $r1 u32 $r2
64   add $r2 (mul high u32 $r2 u32 $r3) $r2
65   mul $r3 u32 $r1 u32 $r2
66   add $r2 (mul high u32 $r2 u32 $r3) $r2
67   mul $r3 u32 $r1 u32 $r2
68   add $r2 (mul high u32 $r2 u32 $r3) $r2
69   mul $r3 u32 $r1 u32 $r2
70   sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
71   add $r2 (mul high u32 $r2 u32 $r3) $r2
72   mul $r3 u32 $r1 u32 $r2
73   add $r2 (mul high u32 $r2 u32 $r3) $r2
   // Keep |a| in $r3, q = mulhi(|a|, z), $r2 = |b|, remainder = |a| - |b| * q
74   mov b32 $r3 $r0
75   mul high $r0 u32 $r0 u32 $r2
76   long cvt u32 $r2 neg u32 $r1
77   long add $r1 (mul u32 $r1 u32 $r0) $r3
78   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
   // Up to two corrections while remainder >= |b|
79   set $p0 0x1 ge u32 $r1 $r2
80   $p0 sub b32 $r1 $r1 $r2
81   $p0 add b32 $r0 $r0 0x1
82   $p0 set $p0 0x1 ge u32 $r1 $r2
83   $p0 sub b32 $r1 $r1 $r2
84   long $p0 add b32 $r0 $r0 0x1
   // Re-apply signs: quotient negated if operand signs differ, remainder
   // negated if the dividend was negative
85   long $p3 cvt s32 $r0 neg s32 $r0
86   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
87   $p2 cvt s32 $r1 neg s32 $r1
88   long ret
89
90// SULDP [for each format]
91// $r4d: address
92// $r2: surface info (format)
93// $p0: access predicate
94// $p1, $p2: caching predicate ($p2$p1 = 00: cv, 01: ca, 10: cg)
95//
96// RGBA32
// 128-bit texel: loaded into $r0q and returned without conversion.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
97$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
98set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
99$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
100$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
101long ret
102// RGBA16_UNORM
// 64-bit texel: four u16 channels, each converted to f32 and scaled by
// 0x37800074 (approximately 1/65535). The load is b64 into $r0d, matching
// the other RGBA16 variants; only $r0 and $r1 are unpacked below.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
103sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
104$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
105set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
106$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
107$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
108cvt rn f32 $r3 u16 1 $r1 // upper half of $r1
109cvt rn f32 $r2 u16 0 $r1 // lower half of $r1
110mul f32 $r3 $r3 0x37800074
111sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
112cvt rn f32 $r1 u16 1 $r0 // upper half of $r0
113mul f32 $r2 $r2 0x37800074
114cvt rn f32 $r0 u16 0 $r0 // lower half of $r0 (converted last, in place)
115mul f32 $r1 $r1 0x37800074
116mul f32 $r0 $r0 0x37800074
117long ret
118// RGBA16_SNORM
// 64-bit texel: four s16 channels, each converted to f32 and scaled by
// 0x38000187 (approximately 1/32767).
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
119$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
120sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
121set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
122$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
123$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
124cvt rn f32 $r3 s16 1 $r1
125cvt rn f32 $r2 s16 0 $r1
126mul f32 $r3 $r3 0x38000187
127cvt rn f32 $r1 s16 1 $r0
128sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
129mul f32 $r2 $r2 0x38000187
130cvt rn f32 $r0 s16 0 $r0
131mul f32 $r1 $r1 0x38000187
132mul f32 $r0 $r0 0x38000187
133long ret
134// RGBA16_SINT
// 64-bit texel: four s16 channels sign-extended to s32 in $r0..$r3.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
135$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
136set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
137sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
138$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
139$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
140cvt s32 $r3 s16 1 $r1
141cvt s32 $r2 s16 0 $r1
142cvt s32 $r1 s16 1 $r0
143cvt s32 $r0 s16 0 $r0
144long ret
145// RGBA16_UINT
// 64-bit texel: four u16 channels zero-extended to u32 in $r0..$r3.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
146sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
147$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
148set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
149$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
150$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
151cvt u32 $r3 u16 1 $r1
152cvt u32 $r2 u16 0 $r1
153cvt u32 $r1 u16 1 $r0
154sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
155cvt u32 $r0 u16 0 $r0
156long ret
157// RGBA16_FLOAT
// 64-bit texel: four f16 channels widened to f32 in $r0..$r3.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
158$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
159set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
160$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
161$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
162cvt f32 $r3 f16 $r1 1
163sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
164cvt f32 $r2 f16 $r1 0
165cvt f32 $r1 f16 $r0 1
166cvt f32 $r0 f16 $r0 0
167long ret
168// RG32_FLOAT
// 64-bit texel returned as loaded in $r0/$r1; $r2 is set to 0 and $r3 to
// 0x3f800000 (1.0f).
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
169$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
170set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
171$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
172sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
173$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
174long mov b32 $r2 0x00000000
175long mov b32 $r3 0x3f800000
176long ret
177// RG32_xINT
// 64-bit texel returned as loaded in $r0/$r1; $r2 is set to 0 and $r3 to
// integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
178$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
179set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
180$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
181sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
182$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
183long mov b32 $r2 0x00000000
184long mov b32 $r3 0x00000001
185long ret
186// RGB10A2_UNORM
// 32-bit texel: three 10-bit fields (bits 0-9, 10-19, 20-29) converted to
// f32 and scaled by 0x3a802007 (approximately 1/1023). $r3 is set to the
// constant 1.0f; the top two bits of the texel are not decoded here.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
187$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
188set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
189$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
190sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
191$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
192ext u32 $r1 $r0 0x0a0a // 10 bits starting at bit 10
193long mov b32 $r3 0x3f800000
194ext u32 $r2 $r0 0x0a14 // 10 bits starting at bit 20
195long and b32 $r0 $r0 0x3ff
196cvt rn f32 $r2 u16 0 $r2
197cvt rn f32 $r1 u16 0 $r1
198sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
199mul f32 $r2 $r2 0x3a802007
200cvt rn f32 $r0 u16 0 $r0
201mul f32 $r1 $r1 0x3a802007
202mul f32 $r0 $r0 0x3a802007
203long ret
204// RGB10A2_UINT
// 32-bit texel: three 10-bit fields (bits 0-9, 10-19, 20-29) returned as
// integers in $r0..$r2. $r3 is set to the constant 1; the top two bits of
// the texel are not decoded here.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
205$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
206set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
207sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
208$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
209$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
210ext u32 $r1 $r0 0x0a0a // 10 bits starting at bit 10
211long mov b32 $r3 0x00000001
212ext u32 $r2 $r0 0x0a14 // 10 bits starting at bit 20
213long and b32 $r0 $r0 0x3ff
214long ret
215// RGBA8_UNORM
// 32-bit texel: four u8 channels, each converted to f32 and scaled by
// 0x3b808081 (approximately 1/255).
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
216sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
217$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
218set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
219$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
220$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
221cvt rn f32 $r3 u8 3 $r0
222cvt rn f32 $r2 u8 2 $r0
223mul f32 $r3 $r3 0x3b808081
224sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
225cvt rn f32 $r1 u8 1 $r0
226mul f32 $r2 $r2 0x3b808081
227cvt rn f32 $r0 u8 0 $r0
228mul f32 $r1 $r1 0x3b808081
229mul f32 $r0 $r0 0x3b808081
230long ret
231// RGBA8_SNORM
// 32-bit texel: four s8 channels, each converted to f32 and scaled by
// 0x3c010204 (approximately 1/127).
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
232$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
233sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
234set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
235$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
236$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
237cvt rn f32 $r3 s8 3 $r0
238cvt rn f32 $r2 s8 2 $r0
239mul f32 $r3 $r3 0x3c010204
240cvt rn f32 $r1 s8 1 $r0
241sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
242mul f32 $r2 $r2 0x3c010204
243cvt rn f32 $r0 s8 0 $r0
244mul f32 $r1 $r1 0x3c010204
245mul f32 $r0 $r0 0x3c010204
246long ret
247// RGBA8_SINT
// 32-bit texel: four s8 channels sign-extended to s32 in $r0..$r3.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
248$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
249set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
250sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
251$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
252$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
253cvt s32 $r3 s8 3 $r0
254cvt s32 $r2 s8 2 $r0
255cvt s32 $r1 s8 1 $r0
256cvt s32 $r0 s8 0 $r0
257long ret
258// RGBA8_UINT
// 32-bit texel: four u8 channels zero-extended to u32 in $r0..$r3.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
259sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
260$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
261set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
262$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
263$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
264cvt u32 $r3 u8 3 $r0
265cvt u32 $r2 u8 2 $r0
266cvt u32 $r1 u8 1 $r0
267sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
268cvt u32 $r0 u8 0 $r0
269long ret
270// R5G6B5_UNORM
// 16-bit texel: fields of 5 bits (bit 0), 6 bits (bit 5) and 5 bits (bit 11)
// converted to f32. The 5-bit fields are scaled by 0x3d042108 (~1/31), the
// 6-bit field by 0x3c820821 (~1/63). $r3 is set to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
271$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
272set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
273$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
274$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
275ext u32 $r1 $r0 0x0605 // 6 bits starting at bit 5
276sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
277long mov b32 $r3 0x3f800000
278ext u32 $r2 $r0 0x050b // 5 bits starting at bit 11
279long and b32 $r0 $r0 0x1f
280cvt rn f32 $r2 u8 0 $r2
281cvt rn f32 $r1 u8 0 $r1
282mul f32 $r2 $r2 0x3d042108
283cvt rn f32 $r0 u8 0 $r0
284sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
285mul f32 $r1 $r1 0x3c820821
286mul f32 $r0 $r0 0x3d042108
287long ret
288// R5G5B5X1_UNORM
// 16-bit texel: three 5-bit fields (bits 0-4, 5-9, 10-14) converted to f32
// and scaled by 0x3d042108 (~1/31). $r3 is set to 1.0f; bit 15 is ignored.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
289$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
290set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
291$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
292$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
293sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
294ext u32 $r1 $r0 0x0505 // 5 bits starting at bit 5
295ext u32 $r2 $r0 0x050a // 5 bits starting at bit 10
296long and b32 $r0 $r0 0x1f
297long mov b32 $r3 0x3f800000
298cvt rn f32 $r2 u8 0 $r2
299cvt rn f32 $r1 u8 0 $r1
300cvt rn f32 $r0 u8 0 $r0
301sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
302mul f32 $r2 $r2 0x3d042108
303mul f32 $r1 $r1 0x3d042108
304mul f32 $r0 $r0 0x3d042108
305long ret
306// RG16_UNORM
// 32-bit texel: two u16 channels converted to f32 and scaled by 0x37800074
// (approximately 1/65535). $r2 is set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
307$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
308set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
309$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
310sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
311$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
312cvt rn f32 $r1 u16 1 $r0
313cvt rn f32 $r0 u16 0 $r0
314mul f32 $r1 $r1 0x37800074
315mul f32 $r0 $r0 0x37800074
316long mov b32 $r2 0x00000000
317long mov b32 $r3 0x3f800000
318sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
319long ret
320// RG16_SNORM
// 32-bit texel: two s16 channels converted to f32 and scaled by 0x38000187
// (approximately 1/32767). $r2 is set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
321$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
322set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
323$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
324$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
325mov b32 $r3 0x3f800000
326cvt rn f32 $r1 s16 1 $r0
327sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
328mov b32 $r2 0x00000000
329cvt rn f32 $r0 s16 0 $r0
330mul f32 $r1 $r1 0x38000187
331mul f32 $r0 $r0 0x38000187
332long ret
333// RG16_SINT
// 32-bit texel: two s16 channels sign-extended to s32 in $r0/$r1. $r2 is
// set to 0 and $r3 to integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
334$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
335set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
336sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
337$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
338$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
339mov b32 $r3 0x00000001
340cvt s32 $r1 s16 1 $r0
341mov b32 $r2 0x00000000
342cvt s32 $r0 s16 0 $r0
343long ret
344// RG16_UINT
// 32-bit texel: two u16 channels zero-extended to u32 in $r0/$r1. $r2 is
// set to 0 and $r3 to integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
345sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
346$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
347set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
348$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
349$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
350mov b32 $r3 0x00000001
351cvt u32 $r1 u16 1 $r0
352mov b32 $r2 0x00000000
353sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
354cvt u32 $r0 u16 0 $r0
355long ret
356// RG16_FLOAT
// 32-bit texel: two f16 channels widened to f32 in $r0/$r1. $r2 is set to
// 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
357$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
358set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
359$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
360$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
361mov b32 $r3 0x3f800000
362sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
363cvt f32 $r1 f16 $r0 1
364mov b32 $r2 0x00000000
365cvt f32 $r0 f16 $r0 0
366long ret
367// R32_FLOAT
// 32-bit texel returned as loaded in $r0; $r1 and $r2 are set to 0 and $r3
// to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
368$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
369set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
370$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
371sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
372$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
373long mov b32 $r3 0x3f800000
374long mov b32 $r2 0x00000000
375long mov b32 $r1 0x00000000
376long ret
377// R32_xINT
// 32-bit texel returned as loaded in $r0; $r1 and $r2 are set to 0 and $r3
// to integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
378$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
379set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
380sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
381$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
382$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
383long mov b32 $r3 0x00000001
384long mov b32 $r2 0x00000000
385long mov b32 $r1 0x00000000
386long ret
387// RG8_UNORM
// 16-bit texel: two u8 channels converted to f32 and scaled by 0x3b808081
// (approximately 1/255). $r2 is set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
388$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
389sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
390set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
391$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
392$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
393mov b32 $r3 0x3f800000
394cvt rn f32 $r1 u8 1 $r0
395mov b32 $r2 0x00000000
396cvt rn f32 $r0 u8 0 $r0
397sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
398mul f32 $r1 $r1 0x3b808081
399mul f32 $r0 $r0 0x3b808081
400long ret
401// RG8_SNORM
// 16-bit texel: two s8 channels converted to f32 and scaled by 0x3c010204
// (approximately 1/127). $r2 is set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
402$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
403set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
404$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
405$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
406sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
407long mov b32 $r3 0x3f800000
408cvt rn f32 $r1 s8 1 $r0
409long mov b32 $r2 0x00000000
410cvt rn f32 $r0 s8 0 $r0
411mul f32 $r1 $r1 0x3c010204
412mul f32 $r0 $r0 0x3c010204
413long ret
414// RG8_UINT
// 16-bit texel: two u8 channels zero-extended to u32 in $r0/$r1. $r2 is
// set to 0 and $r3 to integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
415sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
416$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
417set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
418$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
419$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
420long mov b32 $r3 0x00000001
421cvt u32 $r1 u8 1 $r0
422long mov b32 $r2 0x00000000
423sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
424cvt u32 $r0 u8 0 $r0
425long ret
426// RG8_SINT
// 16-bit texel: two s8 channels sign-extended to s32 in $r0/$r1. $r2 is
// set to 0 and $r3 to integer 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
427$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
428set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
429$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
430$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
431long mov b32 $r3 0x00000001
432sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
433cvt s32 $r1 s8 1 $r0
434long mov b32 $r2 0x00000000
435cvt s32 $r0 s8 0 $r0
436long ret
437// R16_UNORM
// 16-bit texel: one u16 channel converted to f32 and scaled by 0x37800074
// (approximately 1/65535). $r1 and $r2 are set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
438$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
439set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
440$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
441sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
442$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
443long mov b32 $r3 0x3f800000
444cvt rn f32 $r0 u16 0 $r0
445long mov b32 $r2 0x00000000
446long mov b32 $r1 0x00000000
447mul f32 $r0 $r0 0x37800074
448long ret
449// R16_SNORM
// 16-bit texel: one s16 channel converted to f32 and scaled by 0x38000187
// (approximately 1/32767). $r1 and $r2 are set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
450sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
451$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
452set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
453$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
454$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
455mov b32 $r3 0x3f800000
456cvt rn f32 $r0 s16 0 $r0
457long mov b32 $r2 0x00000000
458sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
459long mov b32 $r1 0x00000000
460mul f32 $r0 $r0 0x38000187
461long ret
462// R16_SINT
// 16-bit texel loaded with the s16 load type into $r0, so no separate
// conversion instruction follows. $r1 and $r2 are set to 0, $r3 to 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
463$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
464set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
465$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
466$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
467sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
468long mov b32 $r3 0x00000001
469long mov b32 $r2 0x00000000
470long mov b32 $r1 0x00000000
471long ret
472// R16_UINT
// 16-bit texel loaded with the u16 load type into $r0, so no separate
// conversion instruction follows. $r1 and $r2 are set to 0, $r3 to 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
473$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
474set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
475$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
476sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
477$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
478long mov b32 $r3 0x00000001
479long mov b32 $r2 0x00000000
480long mov b32 $r1 0x00000000
481long ret
482// R16_FLOAT
// 16-bit texel: one f16 channel widened to f32 in $r0. $r1 and $r2 are set
// to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
483$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
484set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
485sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
486$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
487$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
488long mov b32 $r3 0x3f800000
489long mov b32 $r2 0x00000000
490cvt f32 $r0 f16 $r0 0
491mov b32 $r1 0x00000000
492long ret
493// R8_UNORM
// 8-bit texel: one u8 channel converted to f32 and scaled by 0x3b808081
// (approximately 1/255). $r1 and $r2 are set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
494sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
495$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
496set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
497$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
498$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
499mov b32 $r3 0x3f800000
500cvt rn f32 $r0 u8 0 $r0
501mov b32 $r2 0x00000000
502sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
503mul f32 $r0 $r0 0x3b808081
504mov b32 $r1 0x00000000
505long ret
506// R8_SNORM
// 8-bit texel: one s8 channel converted to f32 and scaled by 0x3c010204
// (approximately 1/127). $r1 and $r2 are set to 0 and $r3 to 1.0f.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
507$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
508set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
509$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
510$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
511sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
512mov b32 $r3 0x3f800000
513cvt rn f32 $r0 s8 0 $r0
514mov b32 $r2 0x00000000
515mul f32 $r0 $r0 0x3c010204
516mov b32 $r1 0x00000000
517long ret
518// R8_SINT
// 8-bit texel loaded with the s8 load type into $r0, so no separate
// conversion instruction follows. $r1 and $r2 are set to 0, $r3 to 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
519$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
520sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
521set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
522$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
523$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
524long mov b32 $r3 0x00000001
525long mov b32 $r2 0x00000000
526long mov b32 $r1 0x00000000
527long ret
528// R8_UINT
// 8-bit texel loaded with the u8 load type into $r0, so no separate
// conversion instruction follows. $r1 and $r2 are set to 0, $r3 to 1.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
529sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
530$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
531set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
532$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
533$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
534long mov b32 $r3 0x00000001
535long mov b32 $r2 0x00000000
536long mov b32 $r1 0x00000000
537sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
538long ret
539// R11G11B10_FLOAT TODO
// Not implemented: the 32-bit texel is loaded into $r3, which is then
// overwritten with 1.0f. $r0..$r2 are never written, so no channel data is
// returned. The two nops pad the stub.
// One of the three loads runs, picked by $p1/$p2 (ca, cg or cv).
540$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0
541set $p1 0x1 $p1 xor not $p2 // $p1 = $p1 xor (not $p2): enables the cv load
542$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
543$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
544long mov b32 $r3 0x3f800000
545long nop
546sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
547long nop
548long ret
549
550
551// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
552//
553// INPUT:   $r0d (x)
554// OUTPUT:  $r0d (rcp(x))
555// CLOBBER: $r2 - $r9, $p0
556// SIZE:    9 * 8 bytes (stale: the routine below is much longer than 9 instructions)
557//
558gk104_rcp_f64:
   // Double-precision reciprocal of $r0d, result returned in $r0d.
   // Registers written: $r2-$r7, $r8d (the constant 1.0) and $p0.
   // NOTE(review): a sched line precedes every group of seven instructions
   // (the grouping continues across routine boundaries); keep instruction
   // count and order intact when editing.
559   // Step 1: classify input according to exponent and value, and calculate
560   // result for 0/inf/nan. $r2 holds the exponent value, which starts at
561   // bit 52 (bit 20 of the upper half) and is 11 bits in length
562   ext u32 $r2 $r1 0xb14
563   add b32 $r3 $r2 0xffffffff
564   joinat #rcp_rejoin
565   // We want to check whether the exponent is 0 or 0x7ff (i.e. NaN, inf,
566   // denorm, or 0). Do this by subtracting 1 from the exponent, which will
567   // mean that it's > 0x7fd in those cases when doing unsigned comparison
568   set $p0 0x1 gt u32 $r3 0x7fd
569   // $r3: 0 for norms, 0x36 for denorms, -1 for others
570   long mov b32 $r3 0x0
571   sched 0x2f 0x04 0x2d 0x2b 0x2f 0x28 0x28
572   join (not $p0) nop
573   // Process all special values: NaN, inf, denorm, 0
574   mov b32 $r3 0xffffffff
575   // A number is NaN if its abs value is greater than or unordered with inf
576   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
577   (not $p0) bra #rcp_inf_or_denorm_or_zero
578   // NaN -> NaN, the next line sets the "quiet" bit of the result. This
579   // behavior is both seen on the CPU and the blob
580   join or b32 $r1 $r1 0x80000
581rcp_inf_or_denorm_or_zero:
582   and b32 $r4 $r1 0x7ff00000
583   // Other values with nonzero in exponent field should be inf
584   set $p0 0x1 eq s32 $r4 0x0
585   sched 0x2b 0x04 0x2f 0x2d 0x2b 0x2f 0x20
586   $p0 bra #rcp_denorm_or_zero
587   // +/-Inf -> +/-0
588   xor b32 $r1 $r1 0x7ff00000
589   join mov b32 $r0 0x0
590rcp_denorm_or_zero:
591   set $p0 0x1 gtu f64 abs $r0d 0x0
592   $p0 bra #rcp_denorm
593   // +/-0 -> +/-Inf
594   join or b32 $r1 $r1 0x7ff00000
595rcp_denorm:
596   // non-0 denorms: multiply with 2^54 (the 0x36 in $r3), join with norms
597   mul rn f64 $r0d $r0d 0x4350000000000000
598   sched 0x2f 0x28 0x2b 0x28 0x28 0x04 0x28
599   join mov b32 $r3 0x36
600rcp_rejoin:
601   // All numbers with -1 in $r3 have their result ready in $r0d, return them
602   // others need further calculation
603   set $p0 0x1 lt s32 $r3 0x0
604   $p0 bra #rcp_end
605   // Step 2: Before the real calculation goes on, renormalize the values to
606   // range [1, 2) by setting exponent field to 0x3ff (the exponent of 1)
607   // result in $r6d. The exponent will be recovered later.
608   ext u32 $r2 $r1 0xb14
609   and b32 $r7 $r1 0x800fffff
610   add b32 $r7 $r7 0x3ff00000
611   long mov b32 $r6 $r0
612   sched 0x2b 0x04 0x28 0x28 0x2a 0x2b 0x2e
613   // Step 3: Convert new value to float (no overflow will occur due to step
614   // 2), calculate rcp and do newton-raphson step once
615   cvt rz f32 $r5 f64 $r6d
616   long rcp f32 $r4 $r5
   // 0xbf800000 is -1.0f: $r5 = $r4 * $r5 - 1, then $r0 = $r4 - $r4 * $r5
617   mov b32 $r0 0xbf800000
618   fma rn f32 $r5 $r4 $r5 $r0
619   fma rn f32 $r0 neg $r4 $r5 $r4
620   // Step 4: convert result $r0 back to double, do newton-raphson steps
621   cvt f64 $r0d f32 $r0
   // $r6d = -src (renormalized), $r8d = 1.0
622   cvt f64 $r6d neg f64 $r6d
623   sched 0x2e 0x29 0x29 0x29 0x29 0x29 0x29
624   cvt f64 $r8d f32 0x3f800000
625   // 4 Newton-Raphson Steps, tmp in $r4d, result in $r0d
626   // The formula used here (and above) is:
627   //     RCP_{n + 1} = 2 * RCP_{n} - x * RCP_{n} * RCP_{n}
628   // The following code uses 2 FMAs for each step, and it will basically
629   // looks like:
630   //     tmp = -src * RCP_{n} + 1
631   //     RCP_{n + 1} = RCP_{n} * tmp + RCP_{n}
632   fma rn f64 $r4d $r6d $r0d $r8d
633   fma rn f64 $r0d $r0d $r4d $r0d
634   fma rn f64 $r4d $r6d $r0d $r8d
635   fma rn f64 $r0d $r0d $r4d $r0d
636   fma rn f64 $r4d $r6d $r0d $r8d
637   fma rn f64 $r0d $r0d $r4d $r0d
638   sched 0x29 0x20 0x28 0x28 0x28 0x28 0x28
639   fma rn f64 $r4d $r6d $r0d $r8d
640   fma rn f64 $r0d $r0d $r4d $r0d
641   // Step 5: Exponent recovery and final processing
642   // The exponent is recovered by adding what we added to the exponent.
643   // Suppose we want to calculate rcp(x), but we have rcp(cx), then
644   //     rcp(x) = c * rcp(cx)
645   // The delta in exponent comes from two sources:
646   //   1) The renormalization in step 2. The delta is:
647   //      0x3ff - $r2
648   //   2) (For the denorm input) The 2^54 we multiplied at rcp_denorm, stored
649   //      in $r3
650   // These 2 sources are calculated in the first two lines below, and then
651   // added to the exponent extracted from the result above.
652   // Note that after processing, the new exponent may >= 0x7ff (inf)
653   // or <= 0 (denorm). Those cases will be handled respectively below
654   subr b32 $r2 $r2 0x3ff
655   long add b32 $r4 $r2 $r3
656   ext u32 $r3 $r1 0xb14
657   // New exponent in $r3
658   long add b32 $r3 $r3 $r4
659   add b32 $r2 $r3 0xffffffff
660   sched 0x28 0x2b 0x28 0x2b 0x28 0x28 0x2b
661   // (exponent-1) < 0x7fe (unsigned) means the result is in norm range
662   // (same logic as in step 1)
663   set $p0 0x1 lt u32 $r2 0x7fe
664   (not $p0) bra #rcp_result_inf_or_denorm
665   // Norms: convert exponents back and return
666   shl b32 $r4 $r4 clamp 0x14
667   long add b32 $r1 $r4 $r1
668   bra #rcp_end
669rcp_result_inf_or_denorm:
670   // New exponent >= 0x7ff means that result is inf
671   set $p0 0x1 ge s32 $r3 0x7ff
672   (not $p0) bra #rcp_result_denorm
673   sched 0x20 0x25 0x28 0x2b 0x23 0x25 0x2f
674   // Infinity
675   and b32 $r1 $r1 0x80000000
676   long mov b32 $r0 0x0
677   add b32 $r1 $r1 0x7ff00000
678   bra #rcp_end
679rcp_result_denorm:
680   // Denorm result comes from huge input. The greatest possible fp64, i.e.
681   // 0x7fefffffffffffff's rcp is 0x0004000000000000, 1/4 of the smallest
682   // normal value. Other rcp result should be greater than that. If we
683   // set the exponent field to 1, we can recover the result by multiplying
684   // it with 1/2 or 1/4. 1/2 is used if the "exponent" $r3 is 0, otherwise
685   // 1/4 ($r3 should be -1 then). This is quite tricky but greatly simplifies
686   // the logic here.
687   set $p0 0x1 ne u32 $r3 0x0
688   and b32 $r1 $r1 0x800fffff
689   // 0x3e800000: 1/4
690   $p0 cvt f64 $r6d f32 0x3e800000
691   sched 0x2f 0x28 0x2c 0x2e 0x2a 0x20 0x27
692   // 0x3f000000: 1/2
693   (not $p0) cvt f64 $r6d f32 0x3f000000
   // Force the exponent field to 1 before the final scaling multiply
694   add b32 $r1 $r1 0x00100000
695   mul rn f64 $r0d $r0d $r6d
696rcp_end:
697   long ret
698
699// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
700//
701// INPUT:   $r0d (x)
702// OUTPUT:  $r0d (rsqrt(x))
703// CLOBBER: $r2 - $r9, $p0 - $p1
704// SIZE:    14 * 8 bytes (stale: the routine below is much longer than 14 instructions)
705//
706gk104_rsq_f64:
   // Double-precision reciprocal square root of $r0d, result in $r0d.
   // Registers written: $r2-$r7, $r8d (the constant 0.5), $p0 and $p1.
   // $p1 stays live until the end: it marks inputs that were pre-scaled.
707   // Before getting initial result rsqrt64h, two special cases should be
708   // handled first.
709   // 1. NaN: set the highest bit in mantissa so it'll be surely recognized
710   //    as NaN in rsqrt64h
711   set $p0 0x1 gtu f64 abs $r0d 0x7ff0000000000000
712   $p0 or b32 $r1 $r1 0x00080000
   // $r2 = upper word without the sign bit (OR-ed with the low word below)
713   and b32 $r2 $r1 0x7fffffff
714   sched 0x27 0x20 0x28 0x2c 0x25 0x28 0x28
715   // 2. denorms and small normal values: using their original value will
716   //    lose precision either at rsqrt64h or the first step in newton-raphson
717   //    steps below. Take 2 as a threshold in exponent field, and multiply
718   //    with 2^54 if the exponent is smaller or equal. (will multiply 2^27
719   //    to recover in the end)
720   ext u32 $r3 $r1 0xb14
721   set $p1 0x1 le u32 $r3 0x2
   // $r2 is now non-zero iff the input is not +/-0
722   long or b32 $r2 $r0 $r2
723   $p1 mul rn f64 $r0d $r0d 0x4350000000000000
   // $r5 = upper word of the initial estimate, taken from the upper input word
724   rsqrt64h $r5 $r1
725   // rsqrt64h will give correct result for 0/inf/nan, the following logic
726   // checks whether the input is one of those (exponent is 0x7ff or all 0
727   // except for the sign bit)
728   set b32 $r6 ne u32 $r3 0x7ff
729   long and b32 $r2 $r2 $r6
730   sched 0x28 0x2b 0x20 0x27 0x28 0x2e 0x28
731   set $p0 0x1 ne u32 $r2 0x0
732   $p0 bra #rsq_norm
733   // For 0/inf/nan, make sure the sign bit agrees with input and return
734   and b32 $r1 $r1 0x80000000
735   long mov b32 $r0 0x0
736   long or b32 $r1 $r1 $r5
737   long ret
738rsq_norm:
739   // For others, do 4 Newton-Raphson steps with the formula:
740   //     RSQ_{n + 1} = RSQ_{n} * (1.5 - 0.5 * x * RSQ_{n} * RSQ_{n})
741   // In the code below, each step is written as:
742   //     tmp1 = 0.5 * x * RSQ_{n}
743   //     tmp2 = -RSQ_{n} * tmp1 + 0.5
744   //     RSQ_{n + 1} = RSQ_{n} * tmp2 + RSQ_{n}
   // Initial estimate in $r4d: upper word $r5 from rsqrt64h, lower word zero
745   long mov b32 $r4 0x0
746   sched 0x2f 0x29 0x29 0x29 0x29 0x29 0x29
747   // 0x3f000000: 1/2
748   cvt f64 $r8d f32 0x3f000000
   // $r2d = 0.5 * x, reused by all four steps
749   mul rn f64 $r2d $r0d $r8d
750   mul rn f64 $r0d $r2d $r4d
751   fma rn f64 $r6d neg $r4d $r0d $r8d
752   fma rn f64 $r4d $r4d $r6d $r4d
753   mul rn f64 $r0d $r2d $r4d
754   fma rn f64 $r6d neg $r4d $r0d $r8d
755   sched 0x29 0x29 0x29 0x29 0x29 0x29 0x29
756   fma rn f64 $r4d $r4d $r6d $r4d
757   mul rn f64 $r0d $r2d $r4d
758   fma rn f64 $r6d neg $r4d $r0d $r8d
759   fma rn f64 $r4d $r4d $r6d $r4d
760   mul rn f64 $r0d $r2d $r4d
761   fma rn f64 $r6d neg $r4d $r0d $r8d
762   fma rn f64 $r4d $r4d $r6d $r4d
763   sched 0x29 0x20 0x28 0x2e 0x00 0x00 0x00
764   // Multiply 2^27 to result for small inputs to recover
765   $p1 mul rn f64 $r4d $r4d 0x41a0000000000000
   // Copy the result from $r4d to $r0d
766   long mov b32 $r1 $r5
767   long mov b32 $r0 $r4
768   long ret
769
770//
771// Trap handler.
772// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
773// Low 32 bytes of l[] memory shouldn't be used if resumeability is required.
774//
775// Trap info:
776// 0x000: mutex
777// 0x004: PC
778// 0x008: trapstat
779// 0x00c: warperr
780// 0x010: tidx
781// 0x014: tidy
782// 0x018: tidz
783// 0x01c: ctaidx
784// 0x020: ctaidy
785// 0x024: ctaidz
786// 0x030: $r0q
787// 0x130: $flags
788// 0x140: s[]
789//
// Trap handler body. Saves $r0q to l[0x00], then either resumes the warp
// (end_cont) or dumps its state into the trap info buffer whose address is
// read from c0[0x1900]/c0[0x1904] and terminates (end_exit).
790st b128 wb l[0x00] $r0q
791// check state of the warp and continue if it didn't cause the trap
792long mov b32 $r1 $trapstat
793long mov b32 $r3 $warperr
794mov $r2 $flags mask 0xffff
795and b32 0 $c $r1 $r3
796e $c bra #end_cont
797// spill control flow stack to l[]
798long mov b32 $r3 16
799spill_cfstack:
800preret #end_exit
801sub b32 $r3 $c $r3 0x1
802lg $c bra #spill_cfstack
803// retrieve pointer to trap info
804mov b32 $r0 c0[0x1900]
805mov b32 $r1 c0[0x1904]
806// we only let a single faulting thread store its state
807mov b32 $r3 0x1
808exch b32 $r3 g[$r0d] $r3
809joinat #end_exit
810set $p0 0x1 eq u32 $r3 0x1
811join $p0 nop
812// store $c and $p registers
813st b32 wb g[$r0d+0x130] $r2
814// store $trapstat and $warperr
815long mov b32 $r2 $trapstat
816long mov b32 $r3 $warperr
817st b64 wb g[$r0d+0x8] $r2d
818// store registers
819st b128 wb g[$r0d+0x40] $r4q
820st b128 wb g[$r0d+0x50] $r8q
821st b128 wb g[$r0d+0x60] $r12q
822st b128 wb g[$r0d+0x70] $r16q
823st b128 wb g[$r0d+0x80] $r20q
824st b128 wb g[$r0d+0x90] $r24q
825st b128 wb g[$r0d+0xa0] $r28q
826st b128 wb g[$r0d+0xb0] $r32q
827st b128 wb g[$r0d+0xc0] $r36q
828st b128 wb g[$r0d+0xd0] $r40q
829st b128 wb g[$r0d+0xe0] $r44q
830st b128 wb g[$r0d+0xf0] $r48q
831st b128 wb g[$r0d+0x100] $r52q
832st b128 wb g[$r0d+0x110] $r56q
833st b128 wb g[$r0d+0x120] $r60q
// $r0..$r3 were saved to l[0x00] on entry; copy them out from there
834ld b64 $r2d cs l[0x0]
835st b64 wb g[$r0d+0x30] $r2d
836ld b64 $r2d cs l[0x8]
837st b64 wb g[$r0d+0x38] $r2d
838// store thread id
839long mov b32 $r2 $tidx
840long mov b32 $r3 $tidy
841st b64 wb g[$r0d+0x10] $r2d
842long mov b32 $r2 $tidz
843long mov b32 $r3 $ctaidx
844st b64 wb g[$r0d+0x18] $r2d
845long mov b32 $r2 $ctaidy
846long mov b32 $r3 $ctaidz
847st b64 wb g[$r0d+0x20] $r2d
848// store shared memory (in reverse order so $r0d is base again at the end)
849long mov b32 $r3 $smemsz
850sub b32 $r3 $c $r3 0x4
851s $c bra #shared_done
852add b32 $r0 $c $r0 $r3
853add b32 $r1 $r1 0x0 $c
854shared_loop:
855long ld b32 $r2 s[$r3]
856long st b32 wb g[$r0d+0x140] $r2
857sub b32 $r0 $c $r0 0x4
858sub b32 $r1 $r1 0x0 $c
859sub b32 $r3 $c $r3 0x4
860lg $c bra #shared_loop
861shared_done:
862// search the stack for trap entry to retrieve PC
863mov b32 $r0 c0[0x1908]
864mov b32 $r1 c0[0x190c]
865membar sys
866// invalidate caches so we can read stack entries via g[]
867cctl ivall 0 l[0]
868cctl ivall 0 g[$r0d]
869// get offsets
870mov b32 $r2 $physid
871ext u32 $r3 $r2 0x0814 // MP id
872ext u32 $r2 $r2 0x0608 // warp id
873mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
874mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
875add b32 $r2 $r2 $r3 // MP + warp offset
876add b32 $r0 $c $r0 $r2
877add b32 $r1 $r1 0x0 $c
// Load the remaining-size counter once, BEFORE the loop label. It is
// decremented by 0x10 per entry below; reloading it inside the loop would
// reset the counter on every pass and the scan would never be bounded.
878mov b32 $r3 c0[0x1918] // cstack size
879search_cstack:
880ld u8 $r2 cv g[$r0d+0x8]
881set $p0 0x1 eq u32 $r2 0xa
882$p0 bra #entry_found
883add b32 $r0 $c $r0 0x10
884add b32 $r1 $r1 0x0 $c
885sub b32 $r3 $c $r3 0x10
886lg $c bra #search_cstack
887bra #end_exit
888entry_found:
889// load PC (may be unaligned and spread out)
890ld b32 $r2 cv g[$r0d]
891mov b32 $r0 c0[0x1900]
892mov b32 $r1 c0[0x1904]
893st b32 wb g[$r0d+0x4] $r2
894join nop
895// invalidate caches and exit
896end_exit:
897cctl ivall 0 g[0]
898bpt pause 0x0
899rtt terminate
900end_cont:
// Warp did not cause the trap: restore $flags and $r0q, then return
901bpt pause 0x0
902mov $flags $r2 mask 0xffff
903ld b128 $r0q cs l[0x00]
904rtt
905
// Table of 64-bit entries, one per labelled builtin routine, in the order
// div_u32, div_s32, rcp_f64, rsq_f64. Consumers presumably index it by
// position, so keep the order stable (TODO confirm against the loader).
906.section #gk104_builtin_offsets
907.b64 #gk104_div_u32
908.b64 #gk104_div_s32
909.b64 #gk104_rcp_f64
910.b64 #gk104_rsq_f64
911