1 /* $NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
21 * float64u.
22 */
23
24 /*
25 ===============================================================================
26
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
29
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
39
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
45
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
50
51 ===============================================================================
52 */
53
54 #if defined(LIBC_SCCS) && !defined(lint)
55 __RCSID("$NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $");
56 #endif /* LIBC_SCCS and not lint */
57
58 #ifdef SOFTFLOAT_FOR_GCC
59 #include "softfloat-for-gcc.h"
60 #endif
61
62 #include "milieu.h"
63 #include "softfloat.h"
64
65 /*
66 * Conversions between floats as stored in memory and floats as
67 * SoftFloat uses them
68 */
69 #ifndef FLOAT64_DEMANGLE
70 #define FLOAT64_DEMANGLE(a) (a)
71 #endif
72 #ifndef FLOAT64_MANGLE
73 #define FLOAT64_MANGLE(a) (a)
74 #endif
75
76 /*
77 -------------------------------------------------------------------------------
78 Floating-point rounding mode and exception flags.
79 -------------------------------------------------------------------------------
80 */
81 #ifndef set_float_rounding_mode
82 fp_rnd float_rounding_mode = float_round_nearest_even;
83 fp_except float_exception_flags = 0;
84 #endif
85 #ifndef set_float_exception_inexact_flag
86 #define set_float_exception_inexact_flag() \
87 ((void)(float_exception_flags |= float_flag_inexact))
88 #endif
89
90 /*
91 -------------------------------------------------------------------------------
92 Primitive arithmetic functions, including multi-word arithmetic, and
93 division and square root approximations. (Can be specialized to target if
94 desired.)
95 -------------------------------------------------------------------------------
96 */
97 #include "softfloat-macros"
98
99 /*
100 -------------------------------------------------------------------------------
101 Functions and definitions to determine: (1) whether tininess for underflow
102 is detected before or after rounding by default, (2) what (if anything)
103 happens when exceptions are raised, (3) how signaling NaNs are distinguished
104 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
105 are propagated from function inputs to output. These details are target-
106 specific.
107 -------------------------------------------------------------------------------
108 */
109 #include "softfloat-specialize"
110
111 /*
112 -------------------------------------------------------------------------------
113 Returns the fraction bits of the single-precision floating-point value `a'.
114 -------------------------------------------------------------------------------
115 */
extractFloat32Frac(float32 a)116 INLINE bits32 extractFloat32Frac( float32 a )
117 {
118
119 return a & 0x007FFFFF;
120
121 }
122
123 /*
124 -------------------------------------------------------------------------------
125 Returns the exponent bits of the single-precision floating-point value `a'.
126 -------------------------------------------------------------------------------
127 */
extractFloat32Exp(float32 a)128 INLINE int16 extractFloat32Exp( float32 a )
129 {
130
131 return ( a>>23 ) & 0xFF;
132
133 }
134
135 /*
136 -------------------------------------------------------------------------------
137 Returns the sign bit of the single-precision floating-point value `a'.
138 -------------------------------------------------------------------------------
139 */
extractFloat32Sign(float32 a)140 INLINE flag extractFloat32Sign( float32 a )
141 {
142
143 return a>>31;
144
145 }
146
147 /*
148 -------------------------------------------------------------------------------
149 Normalizes the subnormal single-precision floating-point value represented
150 by the denormalized significand `aSig'. The normalized exponent and
151 significand are stored at the locations pointed to by `zExpPtr' and
152 `zSigPtr', respectively.
153 -------------------------------------------------------------------------------
154 */
155 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)156 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
157 {
158 int8 shiftCount;
159
160 shiftCount = countLeadingZeros32( aSig ) - 8;
161 *zSigPtr = aSig<<shiftCount;
162 *zExpPtr = 1 - shiftCount;
163
164 }
165
166 /*
167 -------------------------------------------------------------------------------
168 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
169 single-precision floating-point value, returning the result. After being
170 shifted into the proper positions, the three fields are simply added
171 together to form the result. This means that any integer portion of `zSig'
172 will be added into the exponent. Since a properly normalized significand
173 will have an integer portion equal to 1, the `zExp' input should be 1 less
174 than the desired result exponent whenever `zSig' is a complete, normalized
175 significand.
176 -------------------------------------------------------------------------------
177 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)178 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
179 {
180
181 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
182
183 }
184
185 /*
186 -------------------------------------------------------------------------------
187 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
188 and significand `zSig', and returns the proper single-precision floating-
189 point value corresponding to the abstract input. Ordinarily, the abstract
190 value is simply rounded and packed into the single-precision format, with
191 the inexact exception raised if the abstract input cannot be represented
192 exactly. However, if the abstract value is too large, the overflow and
193 inexact exceptions are raised and an infinity or maximal finite value is
194 returned. If the abstract value is too small, the input value is rounded to
195 a subnormal number, and the underflow and inexact exceptions are raised if
196 the abstract input cannot be represented exactly as a subnormal single-
197 precision floating-point number.
198 The input significand `zSig' has its binary point between bits 30
199 and 29, which is 7 bits to the left of the usual location. This shifted
200 significand must be normalized or smaller. If `zSig' is not normalized,
201 `zExp' must be 0; in that case, the result returned is a subnormal number,
202 and it must not require rounding. In the usual case that `zSig' is
203 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
204 The handling of underflow and overflow follows the IEC/IEEE Standard for
205 Binary Floating-Point Arithmetic.
206 -------------------------------------------------------------------------------
207 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)208 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
209 {
210 int8 roundingMode;
211 flag roundNearestEven;
212 int8 roundIncrement, roundBits;
213 flag isTiny;
214
215 roundingMode = float_rounding_mode;
216 roundNearestEven = roundingMode == float_round_nearest_even;
217 roundIncrement = 0x40;
218 if ( ! roundNearestEven ) {
219 if ( roundingMode == float_round_to_zero ) {
220 roundIncrement = 0;
221 }
222 else {
223 roundIncrement = 0x7F;
224 if ( zSign ) {
225 if ( roundingMode == float_round_up ) roundIncrement = 0;
226 }
227 else {
228 if ( roundingMode == float_round_down ) roundIncrement = 0;
229 }
230 }
231 }
232 roundBits = zSig & 0x7F;
233 if ( 0xFD <= (bits16) zExp ) {
234 if ( ( 0xFD < zExp )
235 || ( ( zExp == 0xFD )
236 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
237 ) {
238 float_raise( float_flag_overflow | float_flag_inexact );
239 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
240 }
241 if ( zExp < 0 ) {
242 isTiny =
243 ( float_detect_tininess == float_tininess_before_rounding )
244 || ( zExp < -1 )
245 || ( zSig + roundIncrement < (uint32)0x80000000 );
246 shift32RightJamming( zSig, - zExp, &zSig );
247 zExp = 0;
248 roundBits = zSig & 0x7F;
249 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
250 }
251 }
252 if ( roundBits ) set_float_exception_inexact_flag();
253 zSig = ( zSig + roundIncrement )>>7;
254 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
255 if ( zSig == 0 ) zExp = 0;
256 return packFloat32( zSign, zExp, zSig );
257
258 }
259
260 /*
261 -------------------------------------------------------------------------------
262 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
263 and significand `zSig', and returns the proper single-precision floating-
264 point value corresponding to the abstract input. This routine is just like
265 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
266 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
267 floating-point exponent.
268 -------------------------------------------------------------------------------
269 */
270 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)271 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
272 {
273 int8 shiftCount;
274
275 shiftCount = countLeadingZeros32( zSig ) - 1;
276 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
277
278 }
279
280 /*
281 -------------------------------------------------------------------------------
282 Returns the least-significant 32 fraction bits of the double-precision
283 floating-point value `a'.
284 -------------------------------------------------------------------------------
285 */
extractFloat64Frac1(float64 a)286 INLINE bits32 extractFloat64Frac1( float64 a )
287 {
288
289 return (bits32)(FLOAT64_DEMANGLE(a) & LIT64(0x00000000FFFFFFFF));
290
291 }
292
293 /*
294 -------------------------------------------------------------------------------
295 Returns the most-significant 20 fraction bits of the double-precision
296 floating-point value `a'.
297 -------------------------------------------------------------------------------
298 */
extractFloat64Frac0(float64 a)299 INLINE bits32 extractFloat64Frac0( float64 a )
300 {
301
302 return (bits32)((FLOAT64_DEMANGLE(a) >> 32) & 0x000FFFFF);
303
304 }
305
306 /*
307 -------------------------------------------------------------------------------
308 Returns the exponent bits of the double-precision floating-point value `a'.
309 -------------------------------------------------------------------------------
310 */
extractFloat64Exp(float64 a)311 INLINE int16 extractFloat64Exp( float64 a )
312 {
313
314 return (int16)((FLOAT64_DEMANGLE(a) >> 52) & 0x7FF);
315
316 }
317
318 /*
319 -------------------------------------------------------------------------------
320 Returns the sign bit of the double-precision floating-point value `a'.
321 -------------------------------------------------------------------------------
322 */
extractFloat64Sign(float64 a)323 INLINE flag extractFloat64Sign( float64 a )
324 {
325
326 return (flag)(FLOAT64_DEMANGLE(a) >> 63);
327
328 }
329
330 /*
331 -------------------------------------------------------------------------------
332 Normalizes the subnormal double-precision floating-point value represented
333 by the denormalized significand formed by the concatenation of `aSig0' and
334 `aSig1'. The normalized exponent is stored at the location pointed to by
335 `zExpPtr'. The most significant 21 bits of the normalized significand are
336 stored at the location pointed to by `zSig0Ptr', and the least significant
337 32 bits of the normalized significand are stored at the location pointed to
338 by `zSig1Ptr'.
339 -------------------------------------------------------------------------------
340 */
341 static void
normalizeFloat64Subnormal(bits32 aSig0,bits32 aSig1,int16 * zExpPtr,bits32 * zSig0Ptr,bits32 * zSig1Ptr)342 normalizeFloat64Subnormal(
343 bits32 aSig0,
344 bits32 aSig1,
345 int16 *zExpPtr,
346 bits32 *zSig0Ptr,
347 bits32 *zSig1Ptr
348 )
349 {
350 int8 shiftCount;
351
352 if ( aSig0 == 0 ) {
353 shiftCount = countLeadingZeros32( aSig1 ) - 11;
354 if ( shiftCount < 0 ) {
355 *zSig0Ptr = aSig1>>( - shiftCount );
356 *zSig1Ptr = aSig1<<( shiftCount & 31 );
357 }
358 else {
359 *zSig0Ptr = aSig1<<shiftCount;
360 *zSig1Ptr = 0;
361 }
362 *zExpPtr = - shiftCount - 31;
363 }
364 else {
365 shiftCount = countLeadingZeros32( aSig0 ) - 11;
366 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
367 *zExpPtr = 1 - shiftCount;
368 }
369
370 }
371
372 /*
373 -------------------------------------------------------------------------------
374 Packs the sign `zSign', the exponent `zExp', and the significand formed by
375 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
376 point value, returning the result. After being shifted into the proper
377 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
378 together to form the most significant 32 bits of the result. This means
379 that any integer portion of `zSig0' will be added into the exponent. Since
380 a properly normalized significand will have an integer portion equal to 1,
381 the `zExp' input should be 1 less than the desired result exponent whenever
382 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
383 -------------------------------------------------------------------------------
384 */
385 INLINE float64
packFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)386 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
387 {
388
389 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
390 ( ( (bits64) zExp )<<52 ) +
391 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
392
393
394 }
395
396 /*
397 -------------------------------------------------------------------------------
398 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
399 and extended significand formed by the concatenation of `zSig0', `zSig1',
400 and `zSig2', and returns the proper double-precision floating-point value
401 corresponding to the abstract input. Ordinarily, the abstract value is
402 simply rounded and packed into the double-precision format, with the inexact
403 exception raised if the abstract input cannot be represented exactly.
404 However, if the abstract value is too large, the overflow and inexact
405 exceptions are raised and an infinity or maximal finite value is returned.
406 If the abstract value is too small, the input value is rounded to a
407 subnormal number, and the underflow and inexact exceptions are raised if the
408 abstract input cannot be represented exactly as a subnormal double-precision
409 floating-point number.
410 The input significand must be normalized or smaller. If the input
411 significand is not normalized, `zExp' must be 0; in that case, the result
412 returned is a subnormal number, and it must not require rounding. In the
413 usual case that the input significand is normalized, `zExp' must be 1 less
414 than the ``true'' floating-point exponent. The handling of underflow and
415 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
416 -------------------------------------------------------------------------------
417 */
418 static float64
roundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1,bits32 zSig2)419 roundAndPackFloat64(
420 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
421 {
422 int8 roundingMode;
423 flag roundNearestEven, increment, isTiny;
424
425 roundingMode = float_rounding_mode;
426 roundNearestEven = ( roundingMode == float_round_nearest_even );
427 increment = ( (sbits32) zSig2 < 0 );
428 if ( ! roundNearestEven ) {
429 if ( roundingMode == float_round_to_zero ) {
430 increment = 0;
431 }
432 else {
433 if ( zSign ) {
434 increment = ( roundingMode == float_round_down ) && zSig2;
435 }
436 else {
437 increment = ( roundingMode == float_round_up ) && zSig2;
438 }
439 }
440 }
441 if ( 0x7FD <= (bits16) zExp ) {
442 if ( ( 0x7FD < zExp )
443 || ( ( zExp == 0x7FD )
444 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
445 && increment
446 )
447 ) {
448 float_raise( float_flag_overflow | float_flag_inexact );
449 if ( ( roundingMode == float_round_to_zero )
450 || ( zSign && ( roundingMode == float_round_up ) )
451 || ( ! zSign && ( roundingMode == float_round_down ) )
452 ) {
453 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
454 }
455 return packFloat64( zSign, 0x7FF, 0, 0 );
456 }
457 if ( zExp < 0 ) {
458 isTiny =
459 ( float_detect_tininess == float_tininess_before_rounding )
460 || ( zExp < -1 )
461 || ! increment
462 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
463 shift64ExtraRightJamming(
464 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
465 zExp = 0;
466 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
467 if ( roundNearestEven ) {
468 increment = ( (sbits32) zSig2 < 0 );
469 }
470 else {
471 if ( zSign ) {
472 increment = ( roundingMode == float_round_down ) && zSig2;
473 }
474 else {
475 increment = ( roundingMode == float_round_up ) && zSig2;
476 }
477 }
478 }
479 }
480 if ( zSig2 ) set_float_exception_inexact_flag();
481 if ( increment ) {
482 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
483 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
484 }
485 else {
486 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
487 }
488 return packFloat64( zSign, zExp, zSig0, zSig1 );
489
490 }
491
492 /*
493 -------------------------------------------------------------------------------
494 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
495 and significand formed by the concatenation of `zSig0' and `zSig1', and
496 returns the proper double-precision floating-point value corresponding
497 to the abstract input. This routine is just like `roundAndPackFloat64'
498 except that the input significand has fewer bits and does not have to be
499 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
500 point exponent.
501 -------------------------------------------------------------------------------
502 */
503 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)504 normalizeRoundAndPackFloat64(
505 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
506 {
507 int8 shiftCount;
508 bits32 zSig2;
509
510 if ( zSig0 == 0 ) {
511 zSig0 = zSig1;
512 zSig1 = 0;
513 zExp -= 32;
514 }
515 shiftCount = countLeadingZeros32( zSig0 ) - 11;
516 if ( 0 <= shiftCount ) {
517 zSig2 = 0;
518 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
519 }
520 else {
521 shift64ExtraRightJamming(
522 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
523 }
524 zExp -= shiftCount;
525 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
526
527 }
528
529 /*
530 -------------------------------------------------------------------------------
531 Returns the result of converting the 32-bit two's complement integer `a' to
532 the single-precision floating-point format. The conversion is performed
533 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
534 -------------------------------------------------------------------------------
535 */
int32_to_float32(int32 a)536 float32 int32_to_float32( int32 a )
537 {
538 flag zSign;
539
540 if ( a == 0 ) return 0;
541 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
542 zSign = ( a < 0 );
543 return normalizeRoundAndPackFloat32(zSign, 0x9C, (uint32)(zSign ? - a : a));
544
545 }
546
547 /*
548 -------------------------------------------------------------------------------
549 Returns the result of converting the 32-bit two's complement integer `a' to
550 the double-precision floating-point format. The conversion is performed
551 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
552 -------------------------------------------------------------------------------
553 */
int32_to_float64(int32 a)554 float64 int32_to_float64( int32 a )
555 {
556 flag zSign;
557 bits32 absA;
558 int8 shiftCount;
559 bits32 zSig0, zSig1;
560
561 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
562 zSign = ( a < 0 );
563 absA = zSign ? - a : a;
564 shiftCount = countLeadingZeros32( absA ) - 11;
565 if ( 0 <= shiftCount ) {
566 zSig0 = absA<<shiftCount;
567 zSig1 = 0;
568 }
569 else {
570 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
571 }
572 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
573
574 }
575
576 #ifndef SOFTFLOAT_FOR_GCC
577 /*
578 -------------------------------------------------------------------------------
579 Returns the result of converting the single-precision floating-point value
580 `a' to the 32-bit two's complement integer format. The conversion is
581 performed according to the IEC/IEEE Standard for Binary Floating-Point
582 Arithmetic---which means in particular that the conversion is rounded
583 according to the current rounding mode. If `a' is a NaN, the largest
584 positive integer is returned. Otherwise, if the conversion overflows, the
585 largest integer with the same sign as `a' is returned.
586 -------------------------------------------------------------------------------
587 */
float32_to_int32(float32 a)588 int32 float32_to_int32( float32 a )
589 {
590 flag aSign;
591 int16 aExp, shiftCount;
592 bits32 aSig, aSigExtra;
593 int32 z;
594 int8 roundingMode;
595
596 aSig = extractFloat32Frac( a );
597 aExp = extractFloat32Exp( a );
598 aSign = extractFloat32Sign( a );
599 shiftCount = aExp - 0x96;
600 if ( 0 <= shiftCount ) {
601 if ( 0x9E <= aExp ) {
602 if ( a != 0xCF000000 ) {
603 float_raise( float_flag_invalid );
604 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
605 return 0x7FFFFFFF;
606 }
607 }
608 return (sbits32) 0x80000000;
609 }
610 z = ( aSig | 0x00800000 )<<shiftCount;
611 if ( aSign ) z = - z;
612 }
613 else {
614 if ( aExp < 0x7E ) {
615 aSigExtra = aExp | aSig;
616 z = 0;
617 }
618 else {
619 aSig |= 0x00800000;
620 aSigExtra = aSig<<( shiftCount & 31 );
621 z = aSig>>( - shiftCount );
622 }
623 if ( aSigExtra ) set_float_exception_inexact_flag();
624 roundingMode = float_rounding_mode;
625 if ( roundingMode == float_round_nearest_even ) {
626 if ( (sbits32) aSigExtra < 0 ) {
627 ++z;
628 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
629 }
630 if ( aSign ) z = - z;
631 }
632 else {
633 aSigExtra = ( aSigExtra != 0 );
634 if ( aSign ) {
635 z += ( roundingMode == float_round_down ) & aSigExtra;
636 z = - z;
637 }
638 else {
639 z += ( roundingMode == float_round_up ) & aSigExtra;
640 }
641 }
642 }
643 return z;
644
645 }
646 #endif
647
648 /*
649 -------------------------------------------------------------------------------
650 Returns the result of converting the single-precision floating-point value
651 `a' to the 32-bit two's complement integer format. The conversion is
652 performed according to the IEC/IEEE Standard for Binary Floating-Point
653 Arithmetic, except that the conversion is always rounded toward zero.
654 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
655 the conversion overflows, the largest integer with the same sign as `a' is
656 returned.
657 -------------------------------------------------------------------------------
658 */
float32_to_int32_round_to_zero(float32 a)659 int32 float32_to_int32_round_to_zero( float32 a )
660 {
661 flag aSign;
662 int16 aExp, shiftCount;
663 bits32 aSig;
664 int32 z;
665
666 aSig = extractFloat32Frac( a );
667 aExp = extractFloat32Exp( a );
668 aSign = extractFloat32Sign( a );
669 shiftCount = aExp - 0x9E;
670 if ( 0 <= shiftCount ) {
671 if ( a != 0xCF000000 ) {
672 float_raise( float_flag_invalid );
673 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
674 }
675 return (sbits32) 0x80000000;
676 }
677 else if ( aExp <= 0x7E ) {
678 if ( aExp | aSig ) set_float_exception_inexact_flag();
679 return 0;
680 }
681 aSig = ( aSig | 0x00800000 )<<8;
682 z = aSig>>( - shiftCount );
683 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
684 set_float_exception_inexact_flag();
685 }
686 if ( aSign ) z = - z;
687 return z;
688
689 }
690
691 /*
692 -------------------------------------------------------------------------------
693 Returns the result of converting the single-precision floating-point value
694 `a' to the double-precision floating-point format. The conversion is
695 performed according to the IEC/IEEE Standard for Binary Floating-Point
696 Arithmetic.
697 -------------------------------------------------------------------------------
698 */
float32_to_float64(float32 a)699 float64 float32_to_float64( float32 a )
700 {
701 flag aSign;
702 int16 aExp;
703 bits32 aSig, zSig0, zSig1;
704
705 aSig = extractFloat32Frac( a );
706 aExp = extractFloat32Exp( a );
707 aSign = extractFloat32Sign( a );
708 if ( aExp == 0xFF ) {
709 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
710 return packFloat64( aSign, 0x7FF, 0, 0 );
711 }
712 if ( aExp == 0 ) {
713 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
714 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
715 --aExp;
716 }
717 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
718 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
719
720 }
721
722 #ifndef SOFTFLOAT_FOR_GCC
723 /*
724 -------------------------------------------------------------------------------
725 Rounds the single-precision floating-point value `a' to an integer,
726 and returns the result as a single-precision floating-point value. The
727 operation is performed according to the IEC/IEEE Standard for Binary
728 Floating-Point Arithmetic.
729 -------------------------------------------------------------------------------
730 */
float32_round_to_int(float32 a)731 float32 float32_round_to_int( float32 a )
732 {
733 flag aSign;
734 int16 aExp;
735 bits32 lastBitMask, roundBitsMask;
736 int8 roundingMode;
737 float32 z;
738
739 aExp = extractFloat32Exp( a );
740 if ( 0x96 <= aExp ) {
741 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
742 return propagateFloat32NaN( a, a );
743 }
744 return a;
745 }
746 if ( aExp <= 0x7E ) {
747 if ( (bits32) ( a<<1 ) == 0 ) return a;
748 set_float_exception_inexact_flag();
749 aSign = extractFloat32Sign( a );
750 switch ( float_rounding_mode ) {
751 case float_round_nearest_even:
752 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
753 return packFloat32( aSign, 0x7F, 0 );
754 }
755 break;
756 case float_round_to_zero:
757 break;
758 case float_round_down:
759 return aSign ? 0xBF800000 : 0;
760 case float_round_up:
761 return aSign ? 0x80000000 : 0x3F800000;
762 }
763 return packFloat32( aSign, 0, 0 );
764 }
765 lastBitMask = 1;
766 lastBitMask <<= 0x96 - aExp;
767 roundBitsMask = lastBitMask - 1;
768 z = a;
769 roundingMode = float_rounding_mode;
770 if ( roundingMode == float_round_nearest_even ) {
771 z += lastBitMask>>1;
772 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
773 }
774 else if ( roundingMode != float_round_to_zero ) {
775 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
776 z += roundBitsMask;
777 }
778 }
779 z &= ~ roundBitsMask;
780 if ( z != a ) set_float_exception_inexact_flag();
781 return z;
782
783 }
784 #endif
785
786 /*
787 -------------------------------------------------------------------------------
788 Returns the result of adding the absolute values of the single-precision
789 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
790 before being returned. `zSign' is ignored if the result is a NaN.
791 The addition is performed according to the IEC/IEEE Standard for Binary
792 Floating-Point Arithmetic.
793 -------------------------------------------------------------------------------
794 */
addFloat32Sigs(float32 a,float32 b,flag zSign)795 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
796 {
797 int16 aExp, bExp, zExp;
798 bits32 aSig, bSig, zSig;
799 int16 expDiff;
800
801 aSig = extractFloat32Frac( a );
802 aExp = extractFloat32Exp( a );
803 bSig = extractFloat32Frac( b );
804 bExp = extractFloat32Exp( b );
805 expDiff = aExp - bExp;
806 aSig <<= 6;
807 bSig <<= 6;
808 if ( 0 < expDiff ) {
809 if ( aExp == 0xFF ) {
810 if ( aSig ) return propagateFloat32NaN( a, b );
811 return a;
812 }
813 if ( bExp == 0 ) {
814 --expDiff;
815 }
816 else {
817 bSig |= 0x20000000;
818 }
819 shift32RightJamming( bSig, expDiff, &bSig );
820 zExp = aExp;
821 }
822 else if ( expDiff < 0 ) {
823 if ( bExp == 0xFF ) {
824 if ( bSig ) return propagateFloat32NaN( a, b );
825 return packFloat32( zSign, 0xFF, 0 );
826 }
827 if ( aExp == 0 ) {
828 ++expDiff;
829 }
830 else {
831 aSig |= 0x20000000;
832 }
833 shift32RightJamming( aSig, - expDiff, &aSig );
834 zExp = bExp;
835 }
836 else {
837 if ( aExp == 0xFF ) {
838 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
839 return a;
840 }
841 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
842 zSig = 0x40000000 + aSig + bSig;
843 zExp = aExp;
844 goto roundAndPack;
845 }
846 aSig |= 0x20000000;
847 zSig = ( aSig + bSig )<<1;
848 --zExp;
849 if ( (sbits32) zSig < 0 ) {
850 zSig = aSig + bSig;
851 ++zExp;
852 }
853 roundAndPack:
854 return roundAndPackFloat32( zSign, zExp, zSig );
855
856 }
857
858 /*
859 -------------------------------------------------------------------------------
860 Returns the result of subtracting the absolute values of the single-
861 precision floating-point values `a' and `b'. If `zSign' is 1, the
862 difference is negated before being returned. `zSign' is ignored if the
863 result is a NaN. The subtraction is performed according to the IEC/IEEE
864 Standard for Binary Floating-Point Arithmetic.
865 -------------------------------------------------------------------------------
866 */
subFloat32Sigs(float32 a,float32 b,flag zSign)867 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
868 {
869 int16 aExp, bExp, zExp;
870 bits32 aSig, bSig, zSig;
871 int16 expDiff;
872
873 aSig = extractFloat32Frac( a );
874 aExp = extractFloat32Exp( a );
875 bSig = extractFloat32Frac( b );
876 bExp = extractFloat32Exp( b );
877 expDiff = aExp - bExp;
878 aSig <<= 7;
879 bSig <<= 7;
880 if ( 0 < expDiff ) goto aExpBigger;
881 if ( expDiff < 0 ) goto bExpBigger;
882 if ( aExp == 0xFF ) {
883 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
884 float_raise( float_flag_invalid );
885 return float32_default_nan;
886 }
887 if ( aExp == 0 ) {
888 aExp = 1;
889 bExp = 1;
890 }
891 if ( bSig < aSig ) goto aBigger;
892 if ( aSig < bSig ) goto bBigger;
893 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
894 bExpBigger:
895 if ( bExp == 0xFF ) {
896 if ( bSig ) return propagateFloat32NaN( a, b );
897 return packFloat32( zSign ^ 1, 0xFF, 0 );
898 }
899 if ( aExp == 0 ) {
900 ++expDiff;
901 }
902 else {
903 aSig |= 0x40000000;
904 }
905 shift32RightJamming( aSig, - expDiff, &aSig );
906 bSig |= 0x40000000;
907 bBigger:
908 zSig = bSig - aSig;
909 zExp = bExp;
910 zSign ^= 1;
911 goto normalizeRoundAndPack;
912 aExpBigger:
913 if ( aExp == 0xFF ) {
914 if ( aSig ) return propagateFloat32NaN( a, b );
915 return a;
916 }
917 if ( bExp == 0 ) {
918 --expDiff;
919 }
920 else {
921 bSig |= 0x40000000;
922 }
923 shift32RightJamming( bSig, expDiff, &bSig );
924 aSig |= 0x40000000;
925 aBigger:
926 zSig = aSig - bSig;
927 zExp = aExp;
928 normalizeRoundAndPack:
929 --zExp;
930 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
931
932 }
933
934 /*
935 -------------------------------------------------------------------------------
936 Returns the result of adding the single-precision floating-point values `a'
937 and `b'. The operation is performed according to the IEC/IEEE Standard for
938 Binary Floating-Point Arithmetic.
939 -------------------------------------------------------------------------------
940 */
float32_add(float32 a,float32 b)941 float32 float32_add( float32 a, float32 b )
942 {
943 flag aSign, bSign;
944
945 aSign = extractFloat32Sign( a );
946 bSign = extractFloat32Sign( b );
947 if ( aSign == bSign ) {
948 return addFloat32Sigs( a, b, aSign );
949 }
950 else {
951 return subFloat32Sigs( a, b, aSign );
952 }
953
954 }
955
956 /*
957 -------------------------------------------------------------------------------
958 Returns the result of subtracting the single-precision floating-point values
959 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
960 for Binary Floating-Point Arithmetic.
961 -------------------------------------------------------------------------------
962 */
float32_sub(float32 a,float32 b)963 float32 float32_sub( float32 a, float32 b )
964 {
965 flag aSign, bSign;
966
967 aSign = extractFloat32Sign( a );
968 bSign = extractFloat32Sign( b );
969 if ( aSign == bSign ) {
970 return subFloat32Sigs( a, b, aSign );
971 }
972 else {
973 return addFloat32Sigs( a, b, aSign );
974 }
975
976 }
977
978 /*
979 -------------------------------------------------------------------------------
980 Returns the result of multiplying the single-precision floating-point values
981 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
982 for Binary Floating-Point Arithmetic.
983 -------------------------------------------------------------------------------
984 */
float32_mul(float32 a,float32 b)985 float32 float32_mul( float32 a, float32 b )
986 {
987 flag aSign, bSign, zSign;
988 int16 aExp, bExp, zExp;
989 bits32 aSig, bSig, zSig0, zSig1;
990
991 aSig = extractFloat32Frac( a );
992 aExp = extractFloat32Exp( a );
993 aSign = extractFloat32Sign( a );
994 bSig = extractFloat32Frac( b );
995 bExp = extractFloat32Exp( b );
996 bSign = extractFloat32Sign( b );
997 zSign = aSign ^ bSign;
998 if ( aExp == 0xFF ) {
999 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1000 return propagateFloat32NaN( a, b );
1001 }
1002 if ( ( bExp | bSig ) == 0 ) {
1003 float_raise( float_flag_invalid );
1004 return float32_default_nan;
1005 }
1006 return packFloat32( zSign, 0xFF, 0 );
1007 }
1008 if ( bExp == 0xFF ) {
1009 if ( bSig ) return propagateFloat32NaN( a, b );
1010 if ( ( aExp | aSig ) == 0 ) {
1011 float_raise( float_flag_invalid );
1012 return float32_default_nan;
1013 }
1014 return packFloat32( zSign, 0xFF, 0 );
1015 }
1016 if ( aExp == 0 ) {
1017 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1018 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1019 }
1020 if ( bExp == 0 ) {
1021 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1022 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1023 }
1024 zExp = aExp + bExp - 0x7F;
1025 aSig = ( aSig | 0x00800000 )<<7;
1026 bSig = ( bSig | 0x00800000 )<<8;
1027 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1028 zSig0 |= ( zSig1 != 0 );
1029 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1030 zSig0 <<= 1;
1031 --zExp;
1032 }
1033 return roundAndPackFloat32( zSign, zExp, zSig0 );
1034
1035 }
1036
1037 /*
1038 -------------------------------------------------------------------------------
1039 Returns the result of dividing the single-precision floating-point value `a'
1040 by the corresponding value `b'. The operation is performed according to the
1041 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1042 -------------------------------------------------------------------------------
1043 */
float32_div(float32 a,float32 b)1044 float32 float32_div( float32 a, float32 b )
1045 {
1046 flag aSign, bSign, zSign;
1047 int16 aExp, bExp, zExp;
1048 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1049
1050 aSig = extractFloat32Frac( a );
1051 aExp = extractFloat32Exp( a );
1052 aSign = extractFloat32Sign( a );
1053 bSig = extractFloat32Frac( b );
1054 bExp = extractFloat32Exp( b );
1055 bSign = extractFloat32Sign( b );
1056 zSign = aSign ^ bSign;
1057 if ( aExp == 0xFF ) {
1058 if ( aSig ) return propagateFloat32NaN( a, b );
1059 if ( bExp == 0xFF ) {
1060 if ( bSig ) return propagateFloat32NaN( a, b );
1061 float_raise( float_flag_invalid );
1062 return float32_default_nan;
1063 }
1064 return packFloat32( zSign, 0xFF, 0 );
1065 }
1066 if ( bExp == 0xFF ) {
1067 if ( bSig ) return propagateFloat32NaN( a, b );
1068 return packFloat32( zSign, 0, 0 );
1069 }
1070 if ( bExp == 0 ) {
1071 if ( bSig == 0 ) {
1072 if ( ( aExp | aSig ) == 0 ) {
1073 float_raise( float_flag_invalid );
1074 return float32_default_nan;
1075 }
1076 float_raise( float_flag_divbyzero );
1077 return packFloat32( zSign, 0xFF, 0 );
1078 }
1079 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1080 }
1081 if ( aExp == 0 ) {
1082 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1083 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1084 }
1085 zExp = aExp - bExp + 0x7D;
1086 aSig = ( aSig | 0x00800000 )<<7;
1087 bSig = ( bSig | 0x00800000 )<<8;
1088 if ( bSig <= ( aSig + aSig ) ) {
1089 aSig >>= 1;
1090 ++zExp;
1091 }
1092 zSig = estimateDiv64To32( aSig, 0, bSig );
1093 if ( ( zSig & 0x3F ) <= 2 ) {
1094 mul32To64( bSig, zSig, &term0, &term1 );
1095 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1096 while ( (sbits32) rem0 < 0 ) {
1097 --zSig;
1098 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1099 }
1100 zSig |= ( rem1 != 0 );
1101 }
1102 return roundAndPackFloat32( zSign, zExp, zSig );
1103
1104 }
1105
1106 #ifndef SOFTFLOAT_FOR_GCC
1107 /*
1108 -------------------------------------------------------------------------------
1109 Returns the remainder of the single-precision floating-point value `a'
1110 with respect to the corresponding value `b'. The operation is performed
1111 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1112 -------------------------------------------------------------------------------
1113 */
float32_rem(float32 a,float32 b)1114 float32 float32_rem( float32 a, float32 b )
1115 {
1116 flag aSign, bSign, zSign;
1117 int16 aExp, bExp, expDiff;
1118 bits32 aSig, bSig, q, allZero, alternateASig;
1119 sbits32 sigMean;
1120
1121 aSig = extractFloat32Frac( a );
1122 aExp = extractFloat32Exp( a );
1123 aSign = extractFloat32Sign( a );
1124 bSig = extractFloat32Frac( b );
1125 bExp = extractFloat32Exp( b );
1126 bSign = extractFloat32Sign( b );
1127 if ( aExp == 0xFF ) {
1128 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1129 return propagateFloat32NaN( a, b );
1130 }
1131 float_raise( float_flag_invalid );
1132 return float32_default_nan;
1133 }
1134 if ( bExp == 0xFF ) {
1135 if ( bSig ) return propagateFloat32NaN( a, b );
1136 return a;
1137 }
1138 if ( bExp == 0 ) {
1139 if ( bSig == 0 ) {
1140 float_raise( float_flag_invalid );
1141 return float32_default_nan;
1142 }
1143 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1144 }
1145 if ( aExp == 0 ) {
1146 if ( aSig == 0 ) return a;
1147 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1148 }
1149 expDiff = aExp - bExp;
1150 aSig = ( aSig | 0x00800000 )<<8;
1151 bSig = ( bSig | 0x00800000 )<<8;
1152 if ( expDiff < 0 ) {
1153 if ( expDiff < -1 ) return a;
1154 aSig >>= 1;
1155 }
1156 q = ( bSig <= aSig );
1157 if ( q ) aSig -= bSig;
1158 expDiff -= 32;
1159 while ( 0 < expDiff ) {
1160 q = estimateDiv64To32( aSig, 0, bSig );
1161 q = ( 2 < q ) ? q - 2 : 0;
1162 aSig = - ( ( bSig>>2 ) * q );
1163 expDiff -= 30;
1164 }
1165 expDiff += 32;
1166 if ( 0 < expDiff ) {
1167 q = estimateDiv64To32( aSig, 0, bSig );
1168 q = ( 2 < q ) ? q - 2 : 0;
1169 q >>= 32 - expDiff;
1170 bSig >>= 2;
1171 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1172 }
1173 else {
1174 aSig >>= 2;
1175 bSig >>= 2;
1176 }
1177 do {
1178 alternateASig = aSig;
1179 ++q;
1180 aSig -= bSig;
1181 } while ( 0 <= (sbits32) aSig );
1182 sigMean = aSig + alternateASig;
1183 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1184 aSig = alternateASig;
1185 }
1186 zSign = ( (sbits32) aSig < 0 );
1187 if ( zSign ) aSig = - aSig;
1188 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1189
1190 }
1191 #endif
1192
1193 #ifndef SOFTFLOAT_FOR_GCC
1194 /*
1195 -------------------------------------------------------------------------------
1196 Returns the square root of the single-precision floating-point value `a'.
1197 The operation is performed according to the IEC/IEEE Standard for Binary
1198 Floating-Point Arithmetic.
1199 -------------------------------------------------------------------------------
1200 */
float32_sqrt(float32 a)1201 float32 float32_sqrt( float32 a )
1202 {
1203 flag aSign;
1204 int16 aExp, zExp;
1205 bits32 aSig, zSig, rem0, rem1, term0, term1;
1206
1207 aSig = extractFloat32Frac( a );
1208 aExp = extractFloat32Exp( a );
1209 aSign = extractFloat32Sign( a );
1210 if ( aExp == 0xFF ) {
1211 if ( aSig ) return propagateFloat32NaN( a, 0 );
1212 if ( ! aSign ) return a;
1213 float_raise( float_flag_invalid );
1214 return float32_default_nan;
1215 }
1216 if ( aSign ) {
1217 if ( ( aExp | aSig ) == 0 ) return a;
1218 float_raise( float_flag_invalid );
1219 return float32_default_nan;
1220 }
1221 if ( aExp == 0 ) {
1222 if ( aSig == 0 ) return 0;
1223 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1224 }
1225 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1226 aSig = ( aSig | 0x00800000 )<<8;
1227 zSig = estimateSqrt32( aExp, aSig ) + 2;
1228 if ( ( zSig & 0x7F ) <= 5 ) {
1229 if ( zSig < 2 ) {
1230 zSig = 0x7FFFFFFF;
1231 goto roundAndPack;
1232 }
1233 else {
1234 aSig >>= aExp & 1;
1235 mul32To64( zSig, zSig, &term0, &term1 );
1236 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1237 while ( (sbits32) rem0 < 0 ) {
1238 --zSig;
1239 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1240 term1 |= 1;
1241 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1242 }
1243 zSig |= ( ( rem0 | rem1 ) != 0 );
1244 }
1245 }
1246 shift32RightJamming( zSig, 1, &zSig );
1247 roundAndPack:
1248 return roundAndPackFloat32( 0, zExp, zSig );
1249
1250 }
1251 #endif
1252
1253 /*
1254 -------------------------------------------------------------------------------
1255 Returns 1 if the single-precision floating-point value `a' is equal to
1256 the corresponding value `b', and 0 otherwise. The comparison is performed
1257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1258 -------------------------------------------------------------------------------
1259 */
float32_eq(float32 a,float32 b)1260 flag float32_eq( float32 a, float32 b )
1261 {
1262
1263 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1264 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1265 ) {
1266 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1267 float_raise( float_flag_invalid );
1268 }
1269 return 0;
1270 }
1271 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1272
1273 }
1274
1275 /*
1276 -------------------------------------------------------------------------------
1277 Returns 1 if the single-precision floating-point value `a' is less than
1278 or equal to the corresponding value `b', and 0 otherwise. The comparison
1279 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 Arithmetic.
1281 -------------------------------------------------------------------------------
1282 */
float32_le(float32 a,float32 b)1283 flag float32_le( float32 a, float32 b )
1284 {
1285 flag aSign, bSign;
1286
1287 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1288 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1289 ) {
1290 float_raise( float_flag_invalid );
1291 return 0;
1292 }
1293 aSign = extractFloat32Sign( a );
1294 bSign = extractFloat32Sign( b );
1295 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1296 return ( a == b ) || ( aSign ^ ( a < b ) );
1297
1298 }
1299
1300 /*
1301 -------------------------------------------------------------------------------
1302 Returns 1 if the single-precision floating-point value `a' is less than
1303 the corresponding value `b', and 0 otherwise. The comparison is performed
1304 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 -------------------------------------------------------------------------------
1306 */
float32_lt(float32 a,float32 b)1307 flag float32_lt( float32 a, float32 b )
1308 {
1309 flag aSign, bSign;
1310
1311 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1312 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1313 ) {
1314 float_raise( float_flag_invalid );
1315 return 0;
1316 }
1317 aSign = extractFloat32Sign( a );
1318 bSign = extractFloat32Sign( b );
1319 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1320 return ( a != b ) && ( aSign ^ ( a < b ) );
1321
1322 }
1323
1324 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1325 /*
1326 -------------------------------------------------------------------------------
1327 Returns 1 if the single-precision floating-point value `a' is equal to
1328 the corresponding value `b', and 0 otherwise. The invalid exception is
1329 raised if either operand is a NaN. Otherwise, the comparison is performed
1330 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1331 -------------------------------------------------------------------------------
1332 */
float32_eq_signaling(float32 a,float32 b)1333 flag float32_eq_signaling( float32 a, float32 b )
1334 {
1335
1336 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1337 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1338 ) {
1339 float_raise( float_flag_invalid );
1340 return 0;
1341 }
1342 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1343
1344 }
1345
1346 /*
1347 -------------------------------------------------------------------------------
1348 Returns 1 if the single-precision floating-point value `a' is less than or
1349 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1350 cause an exception. Otherwise, the comparison is performed according to the
1351 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1352 -------------------------------------------------------------------------------
1353 */
float32_le_quiet(float32 a,float32 b)1354 flag float32_le_quiet( float32 a, float32 b )
1355 {
1356 flag aSign, bSign;
1357 int16 aExp, bExp;
1358
1359 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1360 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1361 ) {
1362 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1363 float_raise( float_flag_invalid );
1364 }
1365 return 0;
1366 }
1367 aSign = extractFloat32Sign( a );
1368 bSign = extractFloat32Sign( b );
1369 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1370 return ( a == b ) || ( aSign ^ ( a < b ) );
1371
1372 }
1373
1374 /*
1375 -------------------------------------------------------------------------------
1376 Returns 1 if the single-precision floating-point value `a' is less than
1377 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1378 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1379 Standard for Binary Floating-Point Arithmetic.
1380 -------------------------------------------------------------------------------
1381 */
float32_lt_quiet(float32 a,float32 b)1382 flag float32_lt_quiet( float32 a, float32 b )
1383 {
1384 flag aSign, bSign;
1385
1386 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1387 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1388 ) {
1389 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1390 float_raise( float_flag_invalid );
1391 }
1392 return 0;
1393 }
1394 aSign = extractFloat32Sign( a );
1395 bSign = extractFloat32Sign( b );
1396 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1397 return ( a != b ) && ( aSign ^ ( a < b ) );
1398
1399 }
1400 #endif /* !SOFTFLOAT_FOR_GCC */
1401
1402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1403 /*
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the double-precision floating-point value
1406 `a' to the 32-bit two's complement integer format. The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 Arithmetic---which means in particular that the conversion is rounded
1409 according to the current rounding mode. If `a' is a NaN, the largest
1410 positive integer is returned. Otherwise, if the conversion overflows, the
1411 largest integer with the same sign as `a' is returned.
1412 -------------------------------------------------------------------------------
1413 */
float64_to_int32(float64 a)1414 int32 float64_to_int32( float64 a )
1415 {
1416 flag aSign;
1417 int16 aExp, shiftCount;
1418 bits32 aSig0, aSig1, absZ, aSigExtra;
1419 int32 z;
1420 int8 roundingMode;
1421
1422 aSig1 = extractFloat64Frac1( a );
1423 aSig0 = extractFloat64Frac0( a );
1424 aExp = extractFloat64Exp( a );
1425 aSign = extractFloat64Sign( a );
1426 shiftCount = aExp - 0x413;
1427 if ( 0 <= shiftCount ) {
1428 if ( 0x41E < aExp ) {
1429 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1430 goto invalid;
1431 }
1432 shortShift64Left(
1433 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1434 if ( 0x80000000 < absZ ) goto invalid;
1435 }
1436 else {
1437 aSig1 = ( aSig1 != 0 );
1438 if ( aExp < 0x3FE ) {
1439 aSigExtra = aExp | aSig0 | aSig1;
1440 absZ = 0;
1441 }
1442 else {
1443 aSig0 |= 0x00100000;
1444 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1445 absZ = aSig0>>( - shiftCount );
1446 }
1447 }
1448 roundingMode = float_rounding_mode;
1449 if ( roundingMode == float_round_nearest_even ) {
1450 if ( (sbits32) aSigExtra < 0 ) {
1451 ++absZ;
1452 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1453 }
1454 z = aSign ? - absZ : absZ;
1455 }
1456 else {
1457 aSigExtra = ( aSigExtra != 0 );
1458 if ( aSign ) {
1459 z = - ( absZ
1460 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1461 }
1462 else {
1463 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1464 }
1465 }
1466 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1467 invalid:
1468 float_raise( float_flag_invalid );
1469 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1470 }
1471 if ( aSigExtra ) set_float_exception_inexact_flag();
1472 return z;
1473
1474 }
1475 #endif /* !SOFTFLOAT_FOR_GCC */
1476
1477 /*
1478 -------------------------------------------------------------------------------
1479 Returns the result of converting the double-precision floating-point value
1480 `a' to the 32-bit two's complement integer format. The conversion is
1481 performed according to the IEC/IEEE Standard for Binary Floating-Point
1482 Arithmetic, except that the conversion is always rounded toward zero.
1483 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1484 the conversion overflows, the largest integer with the same sign as `a' is
1485 returned.
1486 -------------------------------------------------------------------------------
1487 */
float64_to_int32_round_to_zero(float64 a)1488 int32 float64_to_int32_round_to_zero( float64 a )
1489 {
1490 flag aSign;
1491 int16 aExp, shiftCount;
1492 bits32 aSig0, aSig1, absZ, aSigExtra;
1493 int32 z;
1494
1495 aSig1 = extractFloat64Frac1( a );
1496 aSig0 = extractFloat64Frac0( a );
1497 aExp = extractFloat64Exp( a );
1498 aSign = extractFloat64Sign( a );
1499 shiftCount = aExp - 0x413;
1500 if ( 0 <= shiftCount ) {
1501 if ( 0x41E < aExp ) {
1502 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1503 goto invalid;
1504 }
1505 shortShift64Left(
1506 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1507 }
1508 else {
1509 if ( aExp < 0x3FF ) {
1510 if ( aExp | aSig0 | aSig1 ) {
1511 set_float_exception_inexact_flag();
1512 }
1513 return 0;
1514 }
1515 aSig0 |= 0x00100000;
1516 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1517 absZ = aSig0>>( - shiftCount );
1518 }
1519 z = aSign ? - absZ : absZ;
1520 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1521 invalid:
1522 float_raise( float_flag_invalid );
1523 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1524 }
1525 if ( aSigExtra ) set_float_exception_inexact_flag();
1526 return z;
1527
1528 }
1529
1530 /*
1531 -------------------------------------------------------------------------------
1532 Returns the result of converting the double-precision floating-point value
1533 `a' to the single-precision floating-point format. The conversion is
1534 performed according to the IEC/IEEE Standard for Binary Floating-Point
1535 Arithmetic.
1536 -------------------------------------------------------------------------------
1537 */
float64_to_float32(float64 a)1538 float32 float64_to_float32( float64 a )
1539 {
1540 flag aSign;
1541 int16 aExp;
1542 bits32 aSig0, aSig1, zSig;
1543 bits32 allZero;
1544
1545 aSig1 = extractFloat64Frac1( a );
1546 aSig0 = extractFloat64Frac0( a );
1547 aExp = extractFloat64Exp( a );
1548 aSign = extractFloat64Sign( a );
1549 if ( aExp == 0x7FF ) {
1550 if ( aSig0 | aSig1 ) {
1551 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1552 }
1553 return packFloat32( aSign, 0xFF, 0 );
1554 }
1555 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1556 if ( aExp ) zSig |= 0x40000000;
1557 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1558
1559 }
1560
1561 #ifndef SOFTFLOAT_FOR_GCC
1562 /*
1563 -------------------------------------------------------------------------------
1564 Rounds the double-precision floating-point value `a' to an integer,
1565 and returns the result as a double-precision floating-point value. The
1566 operation is performed according to the IEC/IEEE Standard for Binary
1567 Floating-Point Arithmetic.
1568 -------------------------------------------------------------------------------
1569 */
float64_round_to_int(float64 a)1570 float64 float64_round_to_int( float64 a )
1571 {
1572 flag aSign;
1573 int16 aExp;
1574 bits32 lastBitMask, roundBitsMask;
1575 int8 roundingMode;
1576 float64 z;
1577
1578 aExp = extractFloat64Exp( a );
1579 if ( 0x413 <= aExp ) {
1580 if ( 0x433 <= aExp ) {
1581 if ( ( aExp == 0x7FF )
1582 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1583 return propagateFloat64NaN( a, a );
1584 }
1585 return a;
1586 }
1587 lastBitMask = 1;
1588 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1589 roundBitsMask = lastBitMask - 1;
1590 z = a;
1591 roundingMode = float_rounding_mode;
1592 if ( roundingMode == float_round_nearest_even ) {
1593 if ( lastBitMask ) {
1594 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1595 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1596 }
1597 else {
1598 if ( (sbits32) z.low < 0 ) {
1599 ++z.high;
1600 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1601 }
1602 }
1603 }
1604 else if ( roundingMode != float_round_to_zero ) {
1605 if ( extractFloat64Sign( z )
1606 ^ ( roundingMode == float_round_up ) ) {
1607 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1608 }
1609 }
1610 z.low &= ~ roundBitsMask;
1611 }
1612 else {
1613 if ( aExp <= 0x3FE ) {
1614 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1615 set_float_exception_inexact_flag();
1616 aSign = extractFloat64Sign( a );
1617 switch ( float_rounding_mode ) {
1618 case float_round_nearest_even:
1619 if ( ( aExp == 0x3FE )
1620 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1621 ) {
1622 return packFloat64( aSign, 0x3FF, 0, 0 );
1623 }
1624 break;
1625 case float_round_down:
1626 return
1627 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1628 : packFloat64( 0, 0, 0, 0 );
1629 case float_round_up:
1630 return
1631 aSign ? packFloat64( 1, 0, 0, 0 )
1632 : packFloat64( 0, 0x3FF, 0, 0 );
1633 }
1634 return packFloat64( aSign, 0, 0, 0 );
1635 }
1636 lastBitMask = 1;
1637 lastBitMask <<= 0x413 - aExp;
1638 roundBitsMask = lastBitMask - 1;
1639 z.low = 0;
1640 z.high = a.high;
1641 roundingMode = float_rounding_mode;
1642 if ( roundingMode == float_round_nearest_even ) {
1643 z.high += lastBitMask>>1;
1644 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1645 z.high &= ~ lastBitMask;
1646 }
1647 }
1648 else if ( roundingMode != float_round_to_zero ) {
1649 if ( extractFloat64Sign( z )
1650 ^ ( roundingMode == float_round_up ) ) {
1651 z.high |= ( a.low != 0 );
1652 z.high += roundBitsMask;
1653 }
1654 }
1655 z.high &= ~ roundBitsMask;
1656 }
1657 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1658 set_float_exception_inexact_flag();
1659 }
1660 return z;
1661
1662 }
1663 #endif
1664
1665 /*
1666 -------------------------------------------------------------------------------
1667 Returns the result of adding the absolute values of the double-precision
1668 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1669 before being returned. `zSign' is ignored if the result is a NaN.
1670 The addition is performed according to the IEC/IEEE Standard for Binary
1671 Floating-Point Arithmetic.
1672 -------------------------------------------------------------------------------
1673 */
addFloat64Sigs(float64 a,float64 b,flag zSign)1674 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1675 {
1676 int16 aExp, bExp, zExp;
1677 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1678 int16 expDiff;
1679
1680 aSig1 = extractFloat64Frac1( a );
1681 aSig0 = extractFloat64Frac0( a );
1682 aExp = extractFloat64Exp( a );
1683 bSig1 = extractFloat64Frac1( b );
1684 bSig0 = extractFloat64Frac0( b );
1685 bExp = extractFloat64Exp( b );
1686 expDiff = aExp - bExp;
1687 if ( 0 < expDiff ) {
1688 if ( aExp == 0x7FF ) {
1689 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1690 return a;
1691 }
1692 if ( bExp == 0 ) {
1693 --expDiff;
1694 }
1695 else {
1696 bSig0 |= 0x00100000;
1697 }
1698 shift64ExtraRightJamming(
1699 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1700 zExp = aExp;
1701 }
1702 else if ( expDiff < 0 ) {
1703 if ( bExp == 0x7FF ) {
1704 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1705 return packFloat64( zSign, 0x7FF, 0, 0 );
1706 }
1707 if ( aExp == 0 ) {
1708 ++expDiff;
1709 }
1710 else {
1711 aSig0 |= 0x00100000;
1712 }
1713 shift64ExtraRightJamming(
1714 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1715 zExp = bExp;
1716 }
1717 else {
1718 if ( aExp == 0x7FF ) {
1719 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1720 return propagateFloat64NaN( a, b );
1721 }
1722 return a;
1723 }
1724 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1725 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1726 zSig2 = 0;
1727 zSig0 |= 0x00200000;
1728 zExp = aExp;
1729 goto shiftRight1;
1730 }
1731 aSig0 |= 0x00100000;
1732 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1733 --zExp;
1734 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1735 ++zExp;
1736 shiftRight1:
1737 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1738 roundAndPack:
1739 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1740
1741 }
1742
1743 /*
1744 -------------------------------------------------------------------------------
1745 Returns the result of subtracting the absolute values of the double-
1746 precision floating-point values `a' and `b'. If `zSign' is 1, the
1747 difference is negated before being returned. `zSign' is ignored if the
1748 result is a NaN. The subtraction is performed according to the IEC/IEEE
1749 Standard for Binary Floating-Point Arithmetic.
1750 -------------------------------------------------------------------------------
1751 */
subFloat64Sigs(float64 a,float64 b,flag zSign)1752 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1753 {
1754 int16 aExp, bExp, zExp;
1755 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1756 int16 expDiff;
1757
1758 aSig1 = extractFloat64Frac1( a );
1759 aSig0 = extractFloat64Frac0( a );
1760 aExp = extractFloat64Exp( a );
1761 bSig1 = extractFloat64Frac1( b );
1762 bSig0 = extractFloat64Frac0( b );
1763 bExp = extractFloat64Exp( b );
1764 expDiff = aExp - bExp;
1765 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1766 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1767 if ( 0 < expDiff ) goto aExpBigger;
1768 if ( expDiff < 0 ) goto bExpBigger;
1769 if ( aExp == 0x7FF ) {
1770 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1771 return propagateFloat64NaN( a, b );
1772 }
1773 float_raise( float_flag_invalid );
1774 return float64_default_nan;
1775 }
1776 if ( aExp == 0 ) {
1777 aExp = 1;
1778 bExp = 1;
1779 }
1780 if ( bSig0 < aSig0 ) goto aBigger;
1781 if ( aSig0 < bSig0 ) goto bBigger;
1782 if ( bSig1 < aSig1 ) goto aBigger;
1783 if ( aSig1 < bSig1 ) goto bBigger;
1784 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1785 bExpBigger:
1786 if ( bExp == 0x7FF ) {
1787 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1788 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1789 }
1790 if ( aExp == 0 ) {
1791 ++expDiff;
1792 }
1793 else {
1794 aSig0 |= 0x40000000;
1795 }
1796 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1797 bSig0 |= 0x40000000;
1798 bBigger:
1799 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1800 zExp = bExp;
1801 zSign ^= 1;
1802 goto normalizeRoundAndPack;
1803 aExpBigger:
1804 if ( aExp == 0x7FF ) {
1805 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1806 return a;
1807 }
1808 if ( bExp == 0 ) {
1809 --expDiff;
1810 }
1811 else {
1812 bSig0 |= 0x40000000;
1813 }
1814 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1815 aSig0 |= 0x40000000;
1816 aBigger:
1817 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1818 zExp = aExp;
1819 normalizeRoundAndPack:
1820 --zExp;
1821 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1822
1823 }
1824
1825 /*
1826 -------------------------------------------------------------------------------
1827 Returns the result of adding the double-precision floating-point values `a'
1828 and `b'. The operation is performed according to the IEC/IEEE Standard for
1829 Binary Floating-Point Arithmetic.
1830 -------------------------------------------------------------------------------
1831 */
float64_add(float64 a,float64 b)1832 float64 float64_add( float64 a, float64 b )
1833 {
1834 flag aSign, bSign;
1835
1836 aSign = extractFloat64Sign( a );
1837 bSign = extractFloat64Sign( b );
1838 if ( aSign == bSign ) {
1839 return addFloat64Sigs( a, b, aSign );
1840 }
1841 else {
1842 return subFloat64Sigs( a, b, aSign );
1843 }
1844
1845 }
1846
1847 /*
1848 -------------------------------------------------------------------------------
1849 Returns the result of subtracting the double-precision floating-point values
1850 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1851 for Binary Floating-Point Arithmetic.
1852 -------------------------------------------------------------------------------
1853 */
float64_sub(float64 a,float64 b)1854 float64 float64_sub( float64 a, float64 b )
1855 {
1856 flag aSign, bSign;
1857
1858 aSign = extractFloat64Sign( a );
1859 bSign = extractFloat64Sign( b );
1860 if ( aSign == bSign ) {
1861 return subFloat64Sigs( a, b, aSign );
1862 }
1863 else {
1864 return addFloat64Sigs( a, b, aSign );
1865 }
1866
1867 }
1868
1869 /*
1870 -------------------------------------------------------------------------------
1871 Returns the result of multiplying the double-precision floating-point values
1872 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1873 for Binary Floating-Point Arithmetic.
1874 -------------------------------------------------------------------------------
1875 */
float64_mul(float64 a,float64 b)1876 float64 float64_mul( float64 a, float64 b )
1877 {
1878 flag aSign, bSign, zSign;
1879 int16 aExp, bExp, zExp;
1880 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1881
1882 aSig1 = extractFloat64Frac1( a );
1883 aSig0 = extractFloat64Frac0( a );
1884 aExp = extractFloat64Exp( a );
1885 aSign = extractFloat64Sign( a );
1886 bSig1 = extractFloat64Frac1( b );
1887 bSig0 = extractFloat64Frac0( b );
1888 bExp = extractFloat64Exp( b );
1889 bSign = extractFloat64Sign( b );
1890 zSign = aSign ^ bSign;
1891 if ( aExp == 0x7FF ) {
1892 if ( ( aSig0 | aSig1 )
1893 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1894 return propagateFloat64NaN( a, b );
1895 }
1896 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1897 return packFloat64( zSign, 0x7FF, 0, 0 );
1898 }
1899 if ( bExp == 0x7FF ) {
1900 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1901 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1902 invalid:
1903 float_raise( float_flag_invalid );
1904 return float64_default_nan;
1905 }
1906 return packFloat64( zSign, 0x7FF, 0, 0 );
1907 }
1908 if ( aExp == 0 ) {
1909 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1910 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1911 }
1912 if ( bExp == 0 ) {
1913 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1914 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1915 }
1916 zExp = aExp + bExp - 0x400;
1917 aSig0 |= 0x00100000;
1918 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1919 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1920 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1921 zSig2 |= ( zSig3 != 0 );
1922 if ( 0x00200000 <= zSig0 ) {
1923 shift64ExtraRightJamming(
1924 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1925 ++zExp;
1926 }
1927 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1928
1929 }
1930
1931 /*
1932 -------------------------------------------------------------------------------
1933 Returns the result of dividing the double-precision floating-point value `a'
1934 by the corresponding value `b'. The operation is performed according to the
1935 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1936 -------------------------------------------------------------------------------
1937 */
float64_div(float64 a,float64 b)1938 float64 float64_div( float64 a, float64 b )
1939 {
1940 flag aSign, bSign, zSign;
1941 int16 aExp, bExp, zExp;
1942 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1943 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1944
1945 aSig1 = extractFloat64Frac1( a );
1946 aSig0 = extractFloat64Frac0( a );
1947 aExp = extractFloat64Exp( a );
1948 aSign = extractFloat64Sign( a );
1949 bSig1 = extractFloat64Frac1( b );
1950 bSig0 = extractFloat64Frac0( b );
1951 bExp = extractFloat64Exp( b );
1952 bSign = extractFloat64Sign( b );
1953 zSign = aSign ^ bSign;
1954 if ( aExp == 0x7FF ) {
1955 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1956 if ( bExp == 0x7FF ) {
1957 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1958 goto invalid;
1959 }
1960 return packFloat64( zSign, 0x7FF, 0, 0 );
1961 }
1962 if ( bExp == 0x7FF ) {
1963 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1964 return packFloat64( zSign, 0, 0, 0 );
1965 }
1966 if ( bExp == 0 ) {
1967 if ( ( bSig0 | bSig1 ) == 0 ) {
1968 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1969 invalid:
1970 float_raise( float_flag_invalid );
1971 return float64_default_nan;
1972 }
1973 float_raise( float_flag_divbyzero );
1974 return packFloat64( zSign, 0x7FF, 0, 0 );
1975 }
1976 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1977 }
1978 if ( aExp == 0 ) {
1979 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1980 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1981 }
1982 zExp = aExp - bExp + 0x3FD;
1983 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1984 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1985 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1986 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1987 ++zExp;
1988 }
1989 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1990 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1991 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1992 while ( (sbits32) rem0 < 0 ) {
1993 --zSig0;
1994 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1995 }
1996 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1997 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1998 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1999 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
2000 while ( (sbits32) rem1 < 0 ) {
2001 --zSig1;
2002 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
2003 }
2004 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2005 }
2006 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2007 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2008
2009 }
2010
2011 #ifndef SOFTFLOAT_FOR_GCC
2012 /*
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the double-precision floating-point value `a'
2015 with respect to the corresponding value `b'. The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2018 */
float64_rem(float64 a,float64 b)2019 float64 float64_rem( float64 a, float64 b )
2020 {
2021 flag aSign, bSign, zSign;
2022 int16 aExp, bExp, expDiff;
2023 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2024 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2025 sbits32 sigMean0;
2026 float64 z;
2027
2028 aSig1 = extractFloat64Frac1( a );
2029 aSig0 = extractFloat64Frac0( a );
2030 aExp = extractFloat64Exp( a );
2031 aSign = extractFloat64Sign( a );
2032 bSig1 = extractFloat64Frac1( b );
2033 bSig0 = extractFloat64Frac0( b );
2034 bExp = extractFloat64Exp( b );
2035 bSign = extractFloat64Sign( b );
2036 if ( aExp == 0x7FF ) {
2037 if ( ( aSig0 | aSig1 )
2038 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2039 return propagateFloat64NaN( a, b );
2040 }
2041 goto invalid;
2042 }
2043 if ( bExp == 0x7FF ) {
2044 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2045 return a;
2046 }
2047 if ( bExp == 0 ) {
2048 if ( ( bSig0 | bSig1 ) == 0 ) {
2049 invalid:
2050 float_raise( float_flag_invalid );
2051 return float64_default_nan;
2052 }
2053 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2054 }
2055 if ( aExp == 0 ) {
2056 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2057 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2058 }
2059 expDiff = aExp - bExp;
2060 if ( expDiff < -1 ) return a;
2061 shortShift64Left(
2062 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2063 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2064 q = le64( bSig0, bSig1, aSig0, aSig1 );
2065 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2066 expDiff -= 32;
2067 while ( 0 < expDiff ) {
2068 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2069 q = ( 4 < q ) ? q - 4 : 0;
2070 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2071 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2072 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2073 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2074 expDiff -= 29;
2075 }
2076 if ( -32 < expDiff ) {
2077 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2078 q = ( 4 < q ) ? q - 4 : 0;
2079 q >>= - expDiff;
2080 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2081 expDiff += 24;
2082 if ( expDiff < 0 ) {
2083 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2084 }
2085 else {
2086 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2087 }
2088 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2089 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2090 }
2091 else {
2092 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2093 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2094 }
2095 do {
2096 alternateASig0 = aSig0;
2097 alternateASig1 = aSig1;
2098 ++q;
2099 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2100 } while ( 0 <= (sbits32) aSig0 );
2101 add64(
2102 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2103 if ( ( sigMean0 < 0 )
2104 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2105 aSig0 = alternateASig0;
2106 aSig1 = alternateASig1;
2107 }
2108 zSign = ( (sbits32) aSig0 < 0 );
2109 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2110 return
2111 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2112
2113 }
2114 #endif
2115
2116 #ifndef SOFTFLOAT_FOR_GCC
2117 /*
2118 -------------------------------------------------------------------------------
2119 Returns the square root of the double-precision floating-point value `a'.
2120 The operation is performed according to the IEC/IEEE Standard for Binary
2121 Floating-Point Arithmetic.
2122 -------------------------------------------------------------------------------
2123 */
float64_sqrt(float64 a)2124 float64 float64_sqrt( float64 a )
2125 {
2126 flag aSign;
2127 int16 aExp, zExp;
2128 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2129 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2130 float64 z;
2131
2132 aSig1 = extractFloat64Frac1( a );
2133 aSig0 = extractFloat64Frac0( a );
2134 aExp = extractFloat64Exp( a );
2135 aSign = extractFloat64Sign( a );
2136 if ( aExp == 0x7FF ) {
2137 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2138 if ( ! aSign ) return a;
2139 goto invalid;
2140 }
2141 if ( aSign ) {
2142 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2143 invalid:
2144 float_raise( float_flag_invalid );
2145 return float64_default_nan;
2146 }
2147 if ( aExp == 0 ) {
2148 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2149 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2150 }
2151 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2152 aSig0 |= 0x00100000;
2153 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2154 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2155 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2156 doubleZSig0 = zSig0 + zSig0;
2157 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2158 mul32To64( zSig0, zSig0, &term0, &term1 );
2159 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2160 while ( (sbits32) rem0 < 0 ) {
2161 --zSig0;
2162 doubleZSig0 -= 2;
2163 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2164 }
2165 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2166 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2167 if ( zSig1 == 0 ) zSig1 = 1;
2168 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2169 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2170 mul32To64( zSig1, zSig1, &term2, &term3 );
2171 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2172 while ( (sbits32) rem1 < 0 ) {
2173 --zSig1;
2174 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2175 term3 |= 1;
2176 term2 |= doubleZSig0;
2177 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2178 }
2179 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2180 }
2181 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2182 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2183
2184 }
2185 #endif
2186
2187 /*
2188 -------------------------------------------------------------------------------
2189 Returns 1 if the double-precision floating-point value `a' is equal to
2190 the corresponding value `b', and 0 otherwise. The comparison is performed
2191 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2192 -------------------------------------------------------------------------------
2193 */
float64_eq(float64 a,float64 b)2194 flag float64_eq( float64 a, float64 b )
2195 {
2196
2197 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2198 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2199 || ( ( extractFloat64Exp( b ) == 0x7FF )
2200 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2201 ) {
2202 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2203 float_raise( float_flag_invalid );
2204 }
2205 return 0;
2206 }
2207 return ( a == b ) ||
2208 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2209
2210 }
2211
2212 /*
2213 -------------------------------------------------------------------------------
2214 Returns 1 if the double-precision floating-point value `a' is less than
2215 or equal to the corresponding value `b', and 0 otherwise. The comparison
2216 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2217 Arithmetic.
2218 -------------------------------------------------------------------------------
2219 */
float64_le(float64 a,float64 b)2220 flag float64_le( float64 a, float64 b )
2221 {
2222 flag aSign, bSign;
2223
2224 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2225 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2226 || ( ( extractFloat64Exp( b ) == 0x7FF )
2227 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2228 ) {
2229 float_raise( float_flag_invalid );
2230 return 0;
2231 }
2232 aSign = extractFloat64Sign( a );
2233 bSign = extractFloat64Sign( b );
2234 if ( aSign != bSign )
2235 return aSign ||
2236 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2237 0 );
2238 return ( a == b ) ||
2239 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2240 }
2241
2242 /*
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the double-precision floating-point value `a' is less than
2245 the corresponding value `b', and 0 otherwise. The comparison is performed
2246 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2247 -------------------------------------------------------------------------------
2248 */
float64_lt(float64 a,float64 b)2249 flag float64_lt( float64 a, float64 b )
2250 {
2251 flag aSign, bSign;
2252
2253 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2254 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2255 || ( ( extractFloat64Exp( b ) == 0x7FF )
2256 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2257 ) {
2258 float_raise( float_flag_invalid );
2259 return 0;
2260 }
2261 aSign = extractFloat64Sign( a );
2262 bSign = extractFloat64Sign( b );
2263 if ( aSign != bSign )
2264 return aSign &&
2265 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2266 0 );
2267 return ( a != b ) &&
2268 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2269
2270 }
2271
2272 #ifndef SOFTFLOAT_FOR_GCC
2273 /*
2274 -------------------------------------------------------------------------------
2275 Returns 1 if the double-precision floating-point value `a' is equal to
2276 the corresponding value `b', and 0 otherwise. The invalid exception is
2277 raised if either operand is a NaN. Otherwise, the comparison is performed
2278 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2279 -------------------------------------------------------------------------------
2280 */
float64_eq_signaling(float64 a,float64 b)2281 flag float64_eq_signaling( float64 a, float64 b )
2282 {
2283
2284 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2285 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2286 || ( ( extractFloat64Exp( b ) == 0x7FF )
2287 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2288 ) {
2289 float_raise( float_flag_invalid );
2290 return 0;
2291 }
2292 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2293
2294 }
2295
2296 /*
2297 -------------------------------------------------------------------------------
2298 Returns 1 if the double-precision floating-point value `a' is less than or
2299 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2300 cause an exception. Otherwise, the comparison is performed according to the
2301 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2302 -------------------------------------------------------------------------------
2303 */
float64_le_quiet(float64 a,float64 b)2304 flag float64_le_quiet( float64 a, float64 b )
2305 {
2306 flag aSign, bSign;
2307
2308 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2309 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2310 || ( ( extractFloat64Exp( b ) == 0x7FF )
2311 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2312 ) {
2313 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2314 float_raise( float_flag_invalid );
2315 }
2316 return 0;
2317 }
2318 aSign = extractFloat64Sign( a );
2319 bSign = extractFloat64Sign( b );
2320 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2321 return ( a == b ) || ( aSign ^ ( a < b ) );
2322
2323 }
2324
2325 /*
2326 -------------------------------------------------------------------------------
2327 Returns 1 if the double-precision floating-point value `a' is less than
2328 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2329 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2330 Standard for Binary Floating-Point Arithmetic.
2331 -------------------------------------------------------------------------------
2332 */
float64_lt_quiet(float64 a,float64 b)2333 flag float64_lt_quiet( float64 a, float64 b )
2334 {
2335 flag aSign, bSign;
2336
2337 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2338 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2339 || ( ( extractFloat64Exp( b ) == 0x7FF )
2340 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2341 ) {
2342 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2343 float_raise( float_flag_invalid );
2344 }
2345 return 0;
2346 }
2347 aSign = extractFloat64Sign( a );
2348 bSign = extractFloat64Sign( b );
2349 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2350 return ( a != b ) && ( aSign ^ ( a < b ) );
2351
2352 }
2353
2354 #endif
2355