1 /*-------------------------------------------------------------------------
2  * drawElements Base Portability Library
3  * -------------------------------------
4  *
5  * Copyright 2014 The Android Open Source Project
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  * you may not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  *      http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  *//*!
20  * \file
21  * \brief 16-bit floating-point math.
22  *//*--------------------------------------------------------------------*/
23 
24 #include "deFloat16.h"
25 
26 DE_BEGIN_EXTERN_C
27 
deFloat32To16(float val32)28 deFloat16 deFloat32To16 (float val32)
29 {
30 	deUint32	sign;
31 	int			expotent;
32 	deUint32	mantissa;
33 	union
34 	{
35 		float		f;
36 		deUint32	u;
37 	} x;
38 
39 	x.f			= val32;
40 	sign		= (x.u >> 16u) & 0x00008000u;
41 	expotent	= (int)((x.u >> 23u) & 0x000000ffu) - (127 - 15);
42 	mantissa	= x.u & 0x007fffffu;
43 
44 	if (expotent <= 0)
45 	{
46 		if (expotent < -10)
47 		{
48 			/* Rounds to zero. */
49 			return (deFloat16) sign;
50 		}
51 
52 		/* Converted to denormalized half, add leading 1 to significand. */
53 		mantissa = mantissa | 0x00800000u;
54 
55 		/* Round mantissa to nearest (10+e) */
56 		{
57 			deUint32 t = 14u - expotent;
58 			deUint32 a = (1u << (t - 1u)) - 1u;
59 			deUint32 b = (mantissa >> t) & 1u;
60 
61 			mantissa = (mantissa + a + b) >> t;
62 		}
63 
64 		return (deFloat16) (sign | mantissa);
65 	}
66 	else if (expotent == 0xff - (127 - 15))
67 	{
68 		if (mantissa == 0u)
69 		{
70 			/* InF */
71 			return (deFloat16) (sign | 0x7c00u);
72 		}
73 		else
74 		{
75 			/* NaN */
76 			mantissa >>= 13u;
77 			return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
78 		}
79 	}
80 	else
81 	{
82 		/* Normalized float. */
83 		mantissa = mantissa + 0x00000fffu + ((mantissa >> 13u) & 1u);
84 
85 		if (mantissa & 0x00800000u)
86 		{
87 			/* Overflow in mantissa. */
88 			mantissa  = 0u;
89 			expotent += 1;
90 		}
91 
92 		if (expotent > 30)
93 		{
94 			/* \todo [pyry] Cause hw fp overflow */
95 			return (deFloat16) (sign | 0x7c00u);
96 		}
97 
98 		return (deFloat16) (sign | ((deUint32)expotent << 10u) | (mantissa >> 13u));
99 	}
100 }
101 
deFloat64To16(double val64)102 deFloat16 deFloat64To16 (double val64)
103 {
104 	deUint64	sign;
105 	long		expotent;
106 	deUint64	mantissa;
107 	union
108 	{
109 		double		f;
110 		deUint64	u;
111 	} x;
112 
113 	x.f			= val64;
114 	sign		= (x.u >> 48u) & 0x00008000u;
115 	expotent	= (long int)((x.u >> 52u) & 0x000007ffu) - (1023 - 15);
116 	mantissa	= x.u & 0x00fffffffffffffu;
117 
118 	if (expotent <= 0)
119 	{
120 		if (expotent < -10)
121 		{
122 			/* Rounds to zero. */
123 			return (deFloat16) sign;
124 		}
125 
126 		/* Converted to denormalized half, add leading 1 to significand. */
127 		mantissa = mantissa | 0x0010000000000000u;
128 
129 		/* Round mantissa to nearest (10+e) */
130 		{
131 			deUint64 t = 43u - expotent;
132 			deUint64 a = (1u << (t - 1u)) - 1u;
133 			deUint64 b = (mantissa >> t) & 1u;
134 
135 			mantissa = (mantissa + a + b) >> t;
136 		}
137 
138 		return (deFloat16) (sign | mantissa);
139 	}
140 	else if (expotent == 0x7ff - (1023 - 15))
141 	{
142 		if (mantissa == 0u)
143 		{
144 			/* InF */
145 			return (deFloat16) (sign | 0x7c00u);
146 		}
147 		else
148 		{
149 			/* NaN */
150 			mantissa >>= 42u;
151 			return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
152 		}
153 	}
154 	else
155 	{
156 		/* Normalized float. */
157 		mantissa = mantissa + 0x000001ffffffffffu + ((mantissa >> 42u) & 1u);
158 
159 		if (mantissa & 0x010000000000000u)
160 		{
161 			/* Overflow in mantissa. */
162 			mantissa  = 0u;
163 			expotent += 1;
164 		}
165 
166 		if (expotent > 30)
167 		{
168 			return (deFloat16) (sign | 0x7c00u);
169 		}
170 
171 		return (deFloat16) (sign | ((deUint32)expotent << 10u) | (mantissa >> 13u));
172 	}
173 }
174 
175 /*--------------------------------------------------------------------*//*!
176  * \brief Round the given number `val` to nearest even by discarding
177  *        the last `numBitsToDiscard` bits.
178  * \param val value to round
179  * \param numBitsToDiscard number of (least significant) bits to discard
180  * \return The rounded value with the last `numBitsToDiscard` removed
181  *//*--------------------------------------------------------------------*/
roundToNearestEven(deUint32 val,const deUint32 numBitsToDiscard)182 static deUint32 roundToNearestEven (deUint32 val, const deUint32 numBitsToDiscard)
183 {
184 	const deUint32	lastBits	= val & ((1 << numBitsToDiscard) - 1);
185 	const deUint32	headBit		= val & (1 << (numBitsToDiscard - 1));
186 
187 	DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 32);	/* Make sure no overflow. */
188 	val >>= numBitsToDiscard;
189 
190 	if (headBit == 0)
191 	{
192 		return val;
193 	}
194 	else if (headBit == lastBits)
195 	{
196 		if ((val & 0x1) == 0x1)
197 		{
198 			return val + 1;
199 		}
200 		else
201 		{
202 			return val;
203 		}
204 	}
205 	else
206 	{
207 		return val + 1;
208 	}
209 }
210 
deFloat32To16Round(float val32,deRoundingMode mode)211 deFloat16 deFloat32To16Round (float val32, deRoundingMode mode)
212 {
213 	union
214 	{
215 		float		f;		/* Interpret as 32-bit float */
216 		deUint32	u;		/* Interpret as 32-bit unsigned integer */
217 	} x;
218 	deUint32	sign;		/* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
219 	deUint32	exp32;		/* exp32: biased exponent for 32-bit floats */
220 	int			exp16;		/* exp16: biased exponent for 16-bit floats */
221 	deUint32	mantissa;
222 
223 	/* We only support these two rounding modes for now */
224 	DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
225 
226 	x.f			= val32;
227 	sign		= (x.u >> 16u) & 0x00008000u;
228 	exp32		= (x.u >> 23u) & 0x000000ffu;
229 	exp16		= (int) (exp32) - 127 + 15;	/* 15/127: exponent bias for 16-bit/32-bit floats */
230 	mantissa	= x.u & 0x007fffffu;
231 
232 	/* Case: zero and denormalized floats */
233 	if (exp32 == 0)
234 	{
235 		/* Denormalized floats are < 2^(1-127), not representable in 16-bit floats, rounding to zero. */
236 		return (deFloat16) sign;
237 	}
238 	/* Case: Inf and NaN */
239 	else if (exp32 == 0x000000ffu)
240 	{
241 		if (mantissa == 0u)
242 		{
243 			/* Inf */
244 			return (deFloat16) (sign | 0x7c00u);
245 		}
246 		else
247 		{
248 			/* NaN */
249 			mantissa >>= 13u;	/* 16-bit floats has 10-bit for mantissa, 13-bit less than 32-bit floats. */
250 			/* Make sure we don't turn NaN into zero by | (mantissa == 0). */
251 			return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
252 		}
253 	}
254 	/* The following are cases for normalized floats.
255 	 *
256 	 * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
257 	 *   we can only shift the mantissa further right.
258 	 *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
259 	 *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
260 	 *   So, we just need to right shift the mantissa -exp16 bits.
261 	 * * If exp16 is 0, mantissa shifting requirement is similar to the above.
262 	 * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
263 	 */
264 	/* Case: normalized floats -> zero */
265 	else if (exp16 < -10)
266 	{
267 		/* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
268 		/* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
269 		return (deFloat16) sign;
270 	}
271 	/* Case: normalized floats -> zero and denormalized halfs */
272 	else if (exp16 <= 0)
273 	{
274 		/* Add the implicit leading 1 in mormalized float to mantissa. */
275 		mantissa |= 0x00800000u;
276 		/* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
277 		 * Need to discard the last 14-bits considering rounding mode.
278 		 * We also need to shift right -exp16 bits to encode the underflowed exponent.
279 		 */
280 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
281 		{
282 			mantissa >>= (14 - exp16);
283 		}
284 		else
285 		{
286 			/* mantissa in the above may exceed 10-bits, in which case overflow happens.
287 			 * The overflowed bit is automatically carried to exponent then.
288 			 */
289 			mantissa = roundToNearestEven(mantissa, 14 - exp16);
290 		}
291 		return (deFloat16) (sign | mantissa);
292 	}
293 	/* Case: normalized floats -> normalized floats */
294 	else if (exp16 <= 30)
295 	{
296 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
297 		{
298 			return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 13u));
299 		}
300 		else
301 		{
302 			mantissa	= roundToNearestEven(mantissa, 13);
303 			/* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
304 			exp16		= (exp16 << 10u) + (mantissa & (1 << 10));
305 			mantissa	&= (1u << 10) - 1;
306 			return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
307 		}
308 	}
309 	/* Case: normalized floats (too large to be representable as 16-bit floats) */
310 	else
311 	{
312 		/* According to IEEE Std 754-2008 Section 7.4,
313 		 * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
314 		 *   of the intermediate  result.
315 		 * * roundTowardZero carries all overflows to the format's largest finite number
316 		 *   with the sign of the intermediate result.
317 		 */
318 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
319 		{
320 			return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
321 		}
322 		else
323 		{
324 			return (deFloat16) (sign | (0x1f << 10));
325 		}
326 	}
327 
328 	/* Make compiler happy */
329 	return (deFloat16) 0;
330 }
331 
332 /*--------------------------------------------------------------------*//*!
333  * \brief Round the given number `val` to nearest even by discarding
334  *        the last `numBitsToDiscard` bits.
335  * \param val value to round
336  * \param numBitsToDiscard number of (least significant) bits to discard
337  * \return The rounded value with the last `numBitsToDiscard` removed
338  *//*--------------------------------------------------------------------*/
roundToNearestEven64(deUint64 val,const deUint64 numBitsToDiscard)339 static deUint64 roundToNearestEven64 (deUint64 val, const deUint64 numBitsToDiscard)
340 {
341 	const deUint64	lastBits	= val & (((deUint64)1 << numBitsToDiscard) - 1);
342 	const deUint64	headBit		= val & ((deUint64)1 << (numBitsToDiscard - 1));
343 
344 	DE_ASSERT(numBitsToDiscard > 0 && numBitsToDiscard < 64);	/* Make sure no overflow. */
345 	val >>= numBitsToDiscard;
346 
347 	if (headBit == 0)
348 	{
349 		return val;
350 	}
351 	else if (headBit == lastBits)
352 	{
353 		if ((val & 0x1) == 0x1)
354 		{
355 			return val + 1;
356 		}
357 		else
358 		{
359 			return val;
360 		}
361 	}
362 	else
363 	{
364 		return val + 1;
365 	}
366 }
367 
deFloat64To16Round(double val64,deRoundingMode mode)368 deFloat16 deFloat64To16Round (double val64, deRoundingMode mode)
369 {
370 	union
371 	{
372 		double		f;		/* Interpret as 64-bit float */
373 		deUint64	u;		/* Interpret as 64-bit unsigned integer */
374 	} x;
375 	deUint64	sign;		/* sign : 0000 0000 0000 0000 X000 0000 0000 0000 */
376 	deUint64	exp64;		/* exp32: biased exponent for 64-bit floats */
377 	int			exp16;		/* exp16: biased exponent for 16-bit floats */
378 	deUint64	mantissa;
379 
380 	/* We only support these two rounding modes for now */
381 	DE_ASSERT(mode == DE_ROUNDINGMODE_TO_ZERO || mode == DE_ROUNDINGMODE_TO_NEAREST_EVEN);
382 
383 	x.f			= val64;
384 	sign		= (x.u >> 48u) & 0x00008000u;
385 	exp64		= (x.u >> 52u) & 0x000007ffu;
386 	exp16		= (int) (exp64) - 1023 + 15;	/* 15/127: exponent bias for 16-bit/32-bit floats */
387 	mantissa	= x.u & 0x00fffffffffffffu;
388 
389 	/* Case: zero and denormalized floats */
390 	if (exp64 == 0)
391 	{
392 		/* Denormalized floats are < 2^(1-1023), not representable in 16-bit floats, rounding to zero. */
393 		return (deFloat16) sign;
394 	}
395 	/* Case: Inf and NaN */
396 	else if (exp64 == 0x000007ffu)
397 	{
398 		if (mantissa == 0u)
399 		{
400 			/* Inf */
401 			return (deFloat16) (sign | 0x7c00u);
402 		}
403 		else
404 		{
405 			/* NaN */
406 			mantissa >>= 42u;	/* 16-bit floats has 10-bit for mantissa, 42-bit less than 64-bit floats. */
407 			/* Make sure we don't turn NaN into zero by | (mantissa == 0). */
408 			return (deFloat16) (sign | 0x7c00u | mantissa | (mantissa == 0u));
409 		}
410 	}
411 	/* The following are cases for normalized floats.
412 	 *
413 	 * * If exp16 is less than 0, we are experiencing underflow for the exponent. To encode this underflowed exponent,
414 	 *   we can only shift the mantissa further right.
415 	 *   The real exponent is exp16 - 15. A denormalized 16-bit float can represent -14 via its exponent.
416 	 *   Note that the most significant bit in the mantissa of a denormalized float is already -1 as for exponent.
417 	 *   So, we just need to right shift the mantissa -exp16 bits.
418 	 * * If exp16 is 0, mantissa shifting requirement is similar to the above.
419 	 * * If exp16 is greater than 30 (0b11110), we are experiencing overflow for the exponent of 16-bit normalized floats.
420 	 */
421 	/* Case: normalized floats -> zero */
422 	else if (exp16 < -10)
423 	{
424 		/* 16-bit floats have only 10 bits for mantissa. Minimal 16-bit denormalized float is (2^-10) * (2^-14). */
425 		/* Expecting a number < (2^-10) * (2^-14) here, not representable, round to zero. */
426 		return (deFloat16) sign;
427 	}
428 	/* Case: normalized floats -> zero and denormalized halfs */
429 	else if (exp16 <= 0)
430 	{
431 		/* Add the implicit leading 1 in mormalized float to mantissa. */
432 		mantissa |= 0x0010000000000000u;
433 		/* We have a (23 + 1)-bit mantissa, but 16-bit floats only expect 10-bit mantissa.
434 		 * Need to discard the last 14-bits considering rounding mode.
435 		 * We also need to shift right -exp16 bits to encode the underflowed exponent.
436 		 */
437 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
438 		{
439 			mantissa >>= (43 - exp16);
440 		}
441 		else
442 		{
443 			/* mantissa in the above may exceed 10-bits, in which case overflow happens.
444 			 * The overflowed bit is automatically carried to exponent then.
445 			 */
446 			mantissa = roundToNearestEven64(mantissa, 43 - exp16);
447 		}
448 		return (deFloat16) (sign | mantissa);
449 	}
450 	/* Case: normalized floats -> normalized floats */
451 	else if (exp16 <= 30)
452 	{
453 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
454 		{
455 			return (deFloat16) (sign | ((deUint32)exp16 << 10u) | (mantissa >> 42u));
456 		}
457 		else
458 		{
459 			mantissa	= roundToNearestEven64(mantissa, 42);
460 			/* Handle overflow. exp16 may overflow (and become Inf) itself, but that's correct. */
461 			exp16		= (exp16 << 10u) + (deFloat16)(mantissa & (1 << 10));
462 			mantissa	&= (1u << 10) - 1;
463 			return (deFloat16) (sign | ((deUint32) exp16) | mantissa);
464 		}
465 	}
466 	/* Case: normalized floats (too large to be representable as 16-bit floats) */
467 	else
468 	{
469 		/* According to IEEE Std 754-2008 Section 7.4,
470 		 * * roundTiesToEven and roundTiesToAway carry all overflows to Inf with the sign
471 		 *   of the intermediate  result.
472 		 * * roundTowardZero carries all overflows to the format's largest finite number
473 		 *   with the sign of the intermediate result.
474 		 */
475 		if (mode == DE_ROUNDINGMODE_TO_ZERO)
476 		{
477 			return (deFloat16) (sign | 0x7bffu); /* 111 1011 1111 1111 */
478 		}
479 		else
480 		{
481 			return (deFloat16) (sign | (0x1f << 10));
482 		}
483 	}
484 
485 	/* Make compiler happy */
486 	return (deFloat16) 0;
487 }
488 
deFloat16To32(deFloat16 val16)489 float deFloat16To32 (deFloat16 val16)
490 {
491 	deUint32 sign;
492 	deUint32 expotent;
493 	deUint32 mantissa;
494 	union
495 	{
496 		float		f;
497 		deUint32	u;
498 	} x;
499 
500 	x.u			= 0u;
501 
502 	sign		= ((deUint32)val16 >> 15u) & 0x00000001u;
503 	expotent	= ((deUint32)val16 >> 10u) & 0x0000001fu;
504 	mantissa	= (deUint32)val16 & 0x000003ffu;
505 
506 	if (expotent == 0u)
507 	{
508 		if (mantissa == 0u)
509 		{
510 			/* +/- 0 */
511 			x.u = sign << 31u;
512 			return x.f;
513 		}
514 		else
515 		{
516 			/* Denormalized, normalize it. */
517 
518 			while (!(mantissa & 0x00000400u))
519 			{
520 				mantissa <<= 1u;
521 				expotent -=  1u;
522 			}
523 
524 			expotent += 1u;
525 			mantissa &= ~0x00000400u;
526 		}
527 	}
528 	else if (expotent == 31u)
529 	{
530 		if (mantissa == 0u)
531 		{
532 			/* +/- InF */
533 			x.u = (sign << 31u) | 0x7f800000u;
534 			return x.f;
535 		}
536 		else
537 		{
538 			/* +/- NaN */
539 			x.u = (sign << 31u) | 0x7f800000u | (mantissa << 13u);
540 			return x.f;
541 		}
542 	}
543 
544 	expotent = expotent + (127u - 15u);
545 	mantissa = mantissa << 13u;
546 
547 	x.u = (sign << 31u) | (expotent << 23u) | mantissa;
548 	return x.f;
549 }
550 
deFloat16To64(deFloat16 val16)551 double deFloat16To64 (deFloat16 val16)
552 {
553 	deUint64 sign;
554 	deUint64 expotent;
555 	deUint64 mantissa;
556 	union
557 	{
558 		double		f;
559 		deUint64	u;
560 	} x;
561 
562 	x.u			= 0u;
563 
564 	sign		= ((deUint32)val16 >> 15u) & 0x00000001u;
565 	expotent	= ((deUint32)val16 >> 10u) & 0x0000001fu;
566 	mantissa	= (deUint32)val16 & 0x000003ffu;
567 
568 	if (expotent == 0u)
569 	{
570 		if (mantissa == 0u)
571 		{
572 			/* +/- 0 */
573 			x.u = sign << 63u;
574 			return x.f;
575 		}
576 		else
577 		{
578 			/* Denormalized, normalize it. */
579 
580 			while (!(mantissa & 0x00000400u))
581 			{
582 				mantissa <<= 1u;
583 				expotent -=  1u;
584 			}
585 
586 			expotent += 1u;
587 			mantissa &= ~0x00000400u;
588 		}
589 	}
590 	else if (expotent == 31u)
591 	{
592 		if (mantissa == 0u)
593 		{
594 			/* +/- InF */
595 			x.u = (sign << 63u) | 0x7ff0000000000000u;
596 			return x.f;
597 		}
598 		else
599 		{
600 			/* +/- NaN */
601 			x.u = (sign << 63u) | 0x7ff0000000000000u | (mantissa << 42u);
602 			return x.f;
603 		}
604 	}
605 
606 	expotent = expotent + (1023u - 15u);
607 	mantissa = mantissa << 42u;
608 
609 	x.u = (sign << 63u) | (expotent << 52u) | mantissa;
610 	return x.f;
611 }
612 
613 DE_END_EXTERN_C
614