1 /*
2  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 
26 package sun.misc;
27 
28 import sun.misc.FloatConsts;
29 import sun.misc.DoubleConsts;
30 
31 /**
32  * The class {@code FpUtils} contains static utility methods for
33  * manipulating and inspecting {@code float} and
34  * {@code double} floating-point numbers.  These methods include
35  * functionality recommended or required by the IEEE 754
36  * floating-point standard.
37  *
38  * @author Joseph D. Darcy
39  */
40 
41 public class FpUtils {
42     /*
43      * The methods in this class are reasonably implemented using
44      * direct or indirect bit-level manipulation of floating-point
45      * values.  However, having access to the IEEE 754 recommended
46      * functions would obviate the need for most programmers to engage
47      * in floating-point bit-twiddling.
48      *
49      * An IEEE 754 number has three fields, from most significant bit
50      * to to least significant, sign, exponent, and significand.
51      *
52      *  msb                                lsb
53      * [sign|exponent|  fractional_significand]
54      *
55      * Using some encoding cleverness, explained below, the high order
56      * bit of the logical significand does not need to be explicitly
57      * stored, thus "fractional_significand" instead of simply
58      * "significand" in the figure above.
59      *
60      * For finite normal numbers, the numerical value encoded is
61      *
62      * (-1)^sign * 2^(exponent)*(1.fractional_significand)
63      *
64      * Most finite floating-point numbers are normalized; the exponent
65      * value is reduced until the leading significand bit is 1.
66      * Therefore, the leading 1 is redundant and is not explicitly
67      * stored.  If a numerical value is so small it cannot be
68      * normalized, it has a subnormal representation. Subnormal
69      * numbers don't have a leading 1 in their significand; subnormals
70      * are encoding using a special exponent value.  In other words,
71      * the high-order bit of the logical significand can be elided in
72      * from the representation in either case since the bit's value is
73      * implicit from the exponent value.
74      *
75      * The exponent field uses a biased representation; if the bits of
76      * the exponent are interpreted as a unsigned integer E, the
77      * exponent represented is E - E_bias where E_bias depends on the
78      * floating-point format.  E can range between E_min and E_max,
79      * constants which depend on the floating-point format.  E_min and
80      * E_max are -126 and +127 for float, -1022 and +1023 for double.
81      *
82      * The 32-bit float format has 1 sign bit, 8 exponent bits, and 23
83      * bits for the significand (which is logically 24 bits wide
84      * because of the implicit bit).  The 64-bit double format has 1
85      * sign bit, 11 exponent bits, and 52 bits for the significand
86      * (logically 53 bits).
87      *
88      * Subnormal numbers and zero have the special exponent value
89      * E_min -1; the numerical value represented by a subnormal is:
90      *
91      * (-1)^sign * 2^(E_min)*(0.fractional_significand)
92      *
93      * Zero is represented by all zero bits in the exponent and all
94      * zero bits in the significand; zero can have either sign.
95      *
96      * Infinity and NaN are encoded using the exponent value E_max +
97      * 1.  Signed infinities have all significand bits zero; NaNs have
98      * at least one non-zero significand bit.
99      *
100      * The details of IEEE 754 floating-point encoding will be used in
101      * the methods below without further comment.  For further
102      * exposition on IEEE 754 numbers, see "IEEE Standard for Binary
103      * Floating-Point Arithmetic" ANSI/IEEE Std 754-1985 or William
104      * Kahan's "Lecture Notes on the Status of IEEE Standard 754 for
105      * Binary Floating-Point Arithmetic",
106      * http://www.cs.berkeley.edu/~wkahan/ieee754status/ieee754.ps.
107      *
108      * Many of this class's methods are members of the set of IEEE 754
109      * recommended functions or similar functions recommended or
110      * required by IEEE 754R.  Discussion of various implementation
111      * techniques for these functions have occurred in:
112      *
113      * W.J. Cody and Jerome T. Coonen, "Algorithm 772 Functions to
114      * Support the IEEE Standard for Binary Floating-Point
115      * Arithmetic," ACM Transactions on Mathematical Software,
116      * vol. 19, no. 4, December 1993, pp. 443-451.
117      *
118      * Joseph D. Darcy, "Writing robust IEEE recommended functions in
119      * ``100% Pure Java''(TM)," University of California, Berkeley
120      * technical report UCB//CSD-98-1009.
121      */
122 
123     /**
124      * Don't let anyone instantiate this class.
125      */
FpUtils()126     private FpUtils() {}
127 
128     // Constants used in scalb
129     static double twoToTheDoubleScaleUp = powerOfTwoD(512);
130     static double twoToTheDoubleScaleDown = powerOfTwoD(-512);
131 
132     // Helper Methods
133 
134     // The following helper methods are used in the implementation of
135     // the public recommended functions; they generally omit certain
136     // tests for exception cases.
137 
138     /**
139      * Returns unbiased exponent of a {@code double}.
140      */
getExponent(double d)141     public static int getExponent(double d){
142         /*
143          * Bitwise convert d to long, mask out exponent bits, shift
144          * to the right and then subtract out double's bias adjust to
145          * get true exponent value.
146          */
147         return (int)(((Double.doubleToRawLongBits(d) & DoubleConsts.EXP_BIT_MASK) >>
148                       (DoubleConsts.SIGNIFICAND_WIDTH - 1)) - DoubleConsts.EXP_BIAS);
149     }
150 
151     /**
152      * Returns unbiased exponent of a {@code float}.
153      */
getExponent(float f)154     public static int getExponent(float f){
155         /*
156          * Bitwise convert f to integer, mask out exponent bits, shift
157          * to the right and then subtract out float's bias adjust to
158          * get true exponent value
159          */
160         return ((Float.floatToRawIntBits(f) & FloatConsts.EXP_BIT_MASK) >>
161                 (FloatConsts.SIGNIFICAND_WIDTH - 1)) - FloatConsts.EXP_BIAS;
162     }
163 
164     /**
165      * Returns a floating-point power of two in the normal range.
166      */
powerOfTwoD(int n)167     static double powerOfTwoD(int n) {
168         assert(n >= DoubleConsts.MIN_EXPONENT && n <= DoubleConsts.MAX_EXPONENT);
169         return Double.longBitsToDouble((((long)n + (long)DoubleConsts.EXP_BIAS) <<
170                                         (DoubleConsts.SIGNIFICAND_WIDTH-1))
171                                        & DoubleConsts.EXP_BIT_MASK);
172     }
173 
174     /**
175      * Returns a floating-point power of two in the normal range.
176      */
powerOfTwoF(int n)177     static float powerOfTwoF(int n) {
178         assert(n >= FloatConsts.MIN_EXPONENT && n <= FloatConsts.MAX_EXPONENT);
179         return Float.intBitsToFloat(((n + FloatConsts.EXP_BIAS) <<
180                                      (FloatConsts.SIGNIFICAND_WIDTH-1))
181                                     & FloatConsts.EXP_BIT_MASK);
182     }
183 
184     /**
185      * Returns the first floating-point argument with the sign of the
186      * second floating-point argument.  Note that unlike the {@link
187      * FpUtils#copySign(double, double) copySign} method, this method
188      * does not require NaN {@code sign} arguments to be treated
189      * as positive values; implementations are permitted to treat some
190      * NaN arguments as positive and other NaN arguments as negative
191      * to allow greater performance.
192      *
193      * @param magnitude  the parameter providing the magnitude of the result
194      * @param sign   the parameter providing the sign of the result
195      * @return a value with the magnitude of {@code magnitude}
196      * and the sign of {@code sign}.
197      * @author Joseph D. Darcy
198      */
rawCopySign(double magnitude, double sign)199     public static double rawCopySign(double magnitude, double sign) {
200         return Double.longBitsToDouble((Double.doubleToRawLongBits(sign) &
201                                         (DoubleConsts.SIGN_BIT_MASK)) |
202                                        (Double.doubleToRawLongBits(magnitude) &
203                                         (DoubleConsts.EXP_BIT_MASK |
204                                          DoubleConsts.SIGNIF_BIT_MASK)));
205     }
206 
207     /**
208      * Returns the first floating-point argument with the sign of the
209      * second floating-point argument.  Note that unlike the {@link
210      * FpUtils#copySign(float, float) copySign} method, this method
211      * does not require NaN {@code sign} arguments to be treated
212      * as positive values; implementations are permitted to treat some
213      * NaN arguments as positive and other NaN arguments as negative
214      * to allow greater performance.
215      *
216      * @param magnitude  the parameter providing the magnitude of the result
217      * @param sign   the parameter providing the sign of the result
218      * @return a value with the magnitude of {@code magnitude}
219      * and the sign of {@code sign}.
220      * @author Joseph D. Darcy
221      */
rawCopySign(float magnitude, float sign)222     public static float rawCopySign(float magnitude, float sign) {
223         return Float.intBitsToFloat((Float.floatToRawIntBits(sign) &
224                                      (FloatConsts.SIGN_BIT_MASK)) |
225                                     (Float.floatToRawIntBits(magnitude) &
226                                      (FloatConsts.EXP_BIT_MASK |
227                                       FloatConsts.SIGNIF_BIT_MASK)));
228     }
229 
230     /* ***************************************************************** */
231 
232     /**
233      * Returns {@code true} if the argument is a finite
234      * floating-point value; returns {@code false} otherwise (for
235      * NaN and infinity arguments).
236      *
237      * @param d the {@code double} value to be tested
238      * @return {@code true} if the argument is a finite
239      * floating-point value, {@code false} otherwise.
240      */
isFinite(double d)241     public static boolean isFinite(double d) {
242         return Math.abs(d) <= DoubleConsts.MAX_VALUE;
243     }
244 
245     /**
246      * Returns {@code true} if the argument is a finite
247      * floating-point value; returns {@code false} otherwise (for
248      * NaN and infinity arguments).
249      *
250      * @param f the {@code float} value to be tested
251      * @return {@code true} if the argument is a finite
252      * floating-point value, {@code false} otherwise.
253      */
isFinite(float f)254      public static boolean isFinite(float f) {
255         return Math.abs(f) <= FloatConsts.MAX_VALUE;
256     }
257 
258     /**
259      * Returns {@code true} if the specified number is infinitely
260      * large in magnitude, {@code false} otherwise.
261      *
262      * <p>Note that this method is equivalent to the {@link
263      * Double#isInfinite(double) Double.isInfinite} method; the
264      * functionality is included in this class for convenience.
265      *
266      * @param   d   the value to be tested.
267      * @return  {@code true} if the value of the argument is positive
268      *          infinity or negative infinity; {@code false} otherwise.
269      */
isInfinite(double d)270     public static boolean isInfinite(double d) {
271         return Double.isInfinite(d);
272     }
273 
274     /**
275      * Returns {@code true} if the specified number is infinitely
276      * large in magnitude, {@code false} otherwise.
277      *
278      * <p>Note that this method is equivalent to the {@link
279      * Float#isInfinite(float) Float.isInfinite} method; the
280      * functionality is included in this class for convenience.
281      *
282      * @param   f   the value to be tested.
283      * @return  {@code true} if the argument is positive infinity or
284      *          negative infinity; {@code false} otherwise.
285      */
isInfinite(float f)286      public static boolean isInfinite(float f) {
287          return Float.isInfinite(f);
288     }
289 
290     /**
291      * Returns {@code true} if the specified number is a
292      * Not-a-Number (NaN) value, {@code false} otherwise.
293      *
294      * <p>Note that this method is equivalent to the {@link
295      * Double#isNaN(double) Double.isNaN} method; the functionality is
296      * included in this class for convenience.
297      *
298      * @param   d   the value to be tested.
299      * @return  {@code true} if the value of the argument is NaN;
300      *          {@code false} otherwise.
301      */
isNaN(double d)302     public static boolean isNaN(double d) {
303         return Double.isNaN(d);
304     }
305 
306     /**
307      * Returns {@code true} if the specified number is a
308      * Not-a-Number (NaN) value, {@code false} otherwise.
309      *
310      * <p>Note that this method is equivalent to the {@link
311      * Float#isNaN(float) Float.isNaN} method; the functionality is
312      * included in this class for convenience.
313      *
314      * @param   f   the value to be tested.
315      * @return  {@code true} if the argument is NaN;
316      *          {@code false} otherwise.
317      */
isNaN(float f)318      public static boolean isNaN(float f) {
319         return Float.isNaN(f);
320     }
321 
322     /**
323      * Returns {@code true} if the unordered relation holds
324      * between the two arguments.  When two floating-point values are
325      * unordered, one value is neither less than, equal to, nor
326      * greater than the other.  For the unordered relation to be true,
327      * at least one argument must be a {@code NaN}.
328      *
329      * @param arg1      the first argument
330      * @param arg2      the second argument
331      * @return {@code true} if at least one argument is a NaN,
332      * {@code false} otherwise.
333      */
isUnordered(double arg1, double arg2)334     public static boolean isUnordered(double arg1, double arg2) {
335         return isNaN(arg1) || isNaN(arg2);
336     }
337 
338     /**
339      * Returns {@code true} if the unordered relation holds
340      * between the two arguments.  When two floating-point values are
341      * unordered, one value is neither less than, equal to, nor
342      * greater than the other.  For the unordered relation to be true,
343      * at least one argument must be a {@code NaN}.
344      *
345      * @param arg1      the first argument
346      * @param arg2      the second argument
347      * @return {@code true} if at least one argument is a NaN,
348      * {@code false} otherwise.
349      */
isUnordered(float arg1, float arg2)350      public static boolean isUnordered(float arg1, float arg2) {
351         return isNaN(arg1) || isNaN(arg2);
352     }
353 
354     /**
355      * Returns unbiased exponent of a {@code double}; for
356      * subnormal values, the number is treated as if it were
357      * normalized.  That is for all finite, non-zero, positive numbers
358      * <i>x</i>, <code>scalb(<i>x</i>, -ilogb(<i>x</i>))</code> is
359      * always in the range [1, 2).
360      * <p>
361      * Special cases:
362      * <ul>
363      * <li> If the argument is NaN, then the result is 2<sup>30</sup>.
364      * <li> If the argument is infinite, then the result is 2<sup>28</sup>.
365      * <li> If the argument is zero, then the result is -(2<sup>28</sup>).
366      * </ul>
367      *
368      * @param d floating-point number whose exponent is to be extracted
369      * @return unbiased exponent of the argument.
370      * @author Joseph D. Darcy
371      */
ilogb(double d)372     public static int ilogb(double d) {
373         int exponent = getExponent(d);
374 
375         switch (exponent) {
376         case DoubleConsts.MAX_EXPONENT+1:       // NaN or infinity
377             if( isNaN(d) )
378                 return (1<<30);         // 2^30
379             else // infinite value
380                 return (1<<28);         // 2^28
381 
382         case DoubleConsts.MIN_EXPONENT-1:       // zero or subnormal
383             if(d == 0.0) {
384                 return -(1<<28);        // -(2^28)
385             }
386             else {
387                 long transducer = Double.doubleToRawLongBits(d);
388 
389                 /*
390                  * To avoid causing slow arithmetic on subnormals,
391                  * the scaling to determine when d's significand
392                  * is normalized is done in integer arithmetic.
393                  * (there must be at least one "1" bit in the
394                  * significand since zero has been screened out.
395                  */
396 
397                 // isolate significand bits
398                 transducer &= DoubleConsts.SIGNIF_BIT_MASK;
399                 assert(transducer != 0L);
400 
401                 // This loop is simple and functional. We might be
402                 // able to do something more clever that was faster;
403                 // e.g. number of leading zero detection on
404                 // (transducer << (# exponent and sign bits).
405                 while (transducer <
406                        (1L << (DoubleConsts.SIGNIFICAND_WIDTH - 1))) {
407                     transducer *= 2;
408                     exponent--;
409                 }
410                 exponent++;
411                 assert( exponent >=
412                         DoubleConsts.MIN_EXPONENT - (DoubleConsts.SIGNIFICAND_WIDTH-1) &&
413                         exponent < DoubleConsts.MIN_EXPONENT);
414                 return exponent;
415             }
416 
417         default:
418             assert( exponent >= DoubleConsts.MIN_EXPONENT &&
419                     exponent <= DoubleConsts.MAX_EXPONENT);
420             return exponent;
421         }
422     }
423 
424     /**
425      * Returns unbiased exponent of a {@code float}; for
426      * subnormal values, the number is treated as if it were
427      * normalized.  That is for all finite, non-zero, positive numbers
428      * <i>x</i>, <code>scalb(<i>x</i>, -ilogb(<i>x</i>))</code> is
429      * always in the range [1, 2).
430      * <p>
431      * Special cases:
432      * <ul>
433      * <li> If the argument is NaN, then the result is 2<sup>30</sup>.
434      * <li> If the argument is infinite, then the result is 2<sup>28</sup>.
435      * <li> If the argument is zero, then the result is -(2<sup>28</sup>).
436      * </ul>
437      *
438      * @param f floating-point number whose exponent is to be extracted
439      * @return unbiased exponent of the argument.
440      * @author Joseph D. Darcy
441      */
ilogb(float f)442      public static int ilogb(float f) {
443         int exponent = getExponent(f);
444 
445         switch (exponent) {
446         case FloatConsts.MAX_EXPONENT+1:        // NaN or infinity
447             if( isNaN(f) )
448                 return (1<<30);         // 2^30
449             else // infinite value
450                 return (1<<28);         // 2^28
451 
452         case FloatConsts.MIN_EXPONENT-1:        // zero or subnormal
453             if(f == 0.0f) {
454                 return -(1<<28);        // -(2^28)
455             }
456             else {
457                 int transducer = Float.floatToRawIntBits(f);
458 
459                 /*
460                  * To avoid causing slow arithmetic on subnormals,
461                  * the scaling to determine when f's significand
462                  * is normalized is done in integer arithmetic.
463                  * (there must be at least one "1" bit in the
464                  * significand since zero has been screened out.
465                  */
466 
467                 // isolate significand bits
468                 transducer &= FloatConsts.SIGNIF_BIT_MASK;
469                 assert(transducer != 0);
470 
471                 // This loop is simple and functional. We might be
472                 // able to do something more clever that was faster;
473                 // e.g. number of leading zero detection on
474                 // (transducer << (# exponent and sign bits).
475                 while (transducer <
476                        (1 << (FloatConsts.SIGNIFICAND_WIDTH - 1))) {
477                     transducer *= 2;
478                     exponent--;
479                 }
480                 exponent++;
481                 assert( exponent >=
482                         FloatConsts.MIN_EXPONENT - (FloatConsts.SIGNIFICAND_WIDTH-1) &&
483                         exponent < FloatConsts.MIN_EXPONENT);
484                 return exponent;
485             }
486 
487         default:
488             assert( exponent >= FloatConsts.MIN_EXPONENT &&
489                     exponent <= FloatConsts.MAX_EXPONENT);
490             return exponent;
491         }
492     }
493 
494 
495     /*
496      * The scalb operation should be reasonably fast; however, there
497      * are tradeoffs in writing a method to minimize the worst case
498      * performance and writing a method to minimize the time for
499      * expected common inputs.  Some processors operate very slowly on
500      * subnormal operands, taking hundreds or thousands of cycles for
501      * one floating-point add or multiply as opposed to, say, four
502      * cycles for normal operands.  For processors with very slow
503      * subnormal execution, scalb would be fastest if written entirely
504      * with integer operations; in other words, scalb would need to
505      * include the logic of performing correct rounding of subnormal
506      * values.  This could be reasonably done in at most a few hundred
507      * cycles.  However, this approach may penalize normal operations
508      * since at least the exponent of the floating-point argument must
509      * be examined.
510      *
511      * The approach taken in this implementation is a compromise.
512      * Floating-point multiplication is used to do most of the work;
513      * but knowingly multiplying by a subnormal scaling factor is
514      * avoided.  However, the floating-point argument is not examined
515      * to see whether or not it is subnormal since subnormal inputs
516      * are assumed to be rare.  At most three multiplies are needed to
517      * scale from the largest to smallest exponent ranges (scaling
518      * down, at most two multiplies are needed if subnormal scaling
519      * factors are allowed).  However, in this implementation an
520      * expensive integer remainder operation is avoided at the cost of
521      * requiring five floating-point multiplies in the worst case,
522      * which should still be a performance win.
523      *
524      * If scaling of entire arrays is a concern, it would probably be
525      * more efficient to provide a double[] scalb(double[], int)
526      * version of scalb to avoid having to recompute the needed
527      * scaling factors for each floating-point value.
528      */
529 
530     /**
531      * Return {@code d} &times;
532      * 2<sup>{@code scale_factor}</sup> rounded as if performed
533      * by a single correctly rounded floating-point multiply to a
534      * member of the double value set.  See section 4.2.3 of
535      * <cite>The Java&trade; Language Specification</cite>
536      * for a discussion of floating-point
537      * value sets.  If the exponent of the result is between the
538      * {@code double}'s minimum exponent and maximum exponent,
539      * the answer is calculated exactly.  If the exponent of the
540      * result would be larger than {@code doubles}'s maximum
541      * exponent, an infinity is returned.  Note that if the result is
542      * subnormal, precision may be lost; that is, when {@code scalb(x,
543      * n)} is subnormal, {@code scalb(scalb(x, n), -n)} may
544      * not equal <i>x</i>.  When the result is non-NaN, the result has
545      * the same sign as {@code d}.
546      *
547      *<p>
548      * Special cases:
549      * <ul>
550      * <li> If the first argument is NaN, NaN is returned.
551      * <li> If the first argument is infinite, then an infinity of the
552      * same sign is returned.
553      * <li> If the first argument is zero, then a zero of the same
554      * sign is returned.
555      * </ul>
556      *
557      * @param d number to be scaled by a power of two.
558      * @param scale_factor power of 2 used to scale {@code d}
559      * @return {@code d * }2<sup>{@code scale_factor}</sup>
560      * @author Joseph D. Darcy
561      */
scalb(double d, int scale_factor)562     public static double scalb(double d, int scale_factor) {
563         /*
564          * This method does not need to be declared strictfp to
565          * compute the same correct result on all platforms.  When
566          * scaling up, it does not matter what order the
567          * multiply-store operations are done; the result will be
568          * finite or overflow regardless of the operation ordering.
569          * However, to get the correct result when scaling down, a
570          * particular ordering must be used.
571          *
572          * When scaling down, the multiply-store operations are
573          * sequenced so that it is not possible for two consecutive
574          * multiply-stores to return subnormal results.  If one
575          * multiply-store result is subnormal, the next multiply will
576          * round it away to zero.  This is done by first multiplying
577          * by 2 ^ (scale_factor % n) and then multiplying several
578          * times by by 2^n as needed where n is the exponent of number
579          * that is a covenient power of two.  In this way, at most one
580          * real rounding error occurs.  If the double value set is
581          * being used exclusively, the rounding will occur on a
582          * multiply.  If the double-extended-exponent value set is
583          * being used, the products will (perhaps) be exact but the
584          * stores to d are guaranteed to round to the double value
585          * set.
586          *
587          * It is _not_ a valid implementation to first multiply d by
588          * 2^MIN_EXPONENT and then by 2 ^ (scale_factor %
589          * MIN_EXPONENT) since even in a strictfp program double
590          * rounding on underflow could occur; e.g. if the scale_factor
591          * argument was (MIN_EXPONENT - n) and the exponent of d was a
592          * little less than -(MIN_EXPONENT - n), meaning the final
593          * result would be subnormal.
594          *
595          * Since exact reproducibility of this method can be achieved
596          * without any undue performance burden, there is no
597          * compelling reason to allow double rounding on underflow in
598          * scalb.
599          */
600 
601         // magnitude of a power of two so large that scaling a finite
602         // nonzero value by it would be guaranteed to over or
603         // underflow; due to rounding, scaling down takes takes an
604         // additional power of two which is reflected here
605         final int MAX_SCALE = DoubleConsts.MAX_EXPONENT + -DoubleConsts.MIN_EXPONENT +
606                               DoubleConsts.SIGNIFICAND_WIDTH + 1;
607         int exp_adjust = 0;
608         int scale_increment = 0;
609         double exp_delta = Double.NaN;
610 
611         // Make sure scaling factor is in a reasonable range
612 
613         if(scale_factor < 0) {
614             scale_factor = Math.max(scale_factor, -MAX_SCALE);
615             scale_increment = -512;
616             exp_delta = twoToTheDoubleScaleDown;
617         }
618         else {
619             scale_factor = Math.min(scale_factor, MAX_SCALE);
620             scale_increment = 512;
621             exp_delta = twoToTheDoubleScaleUp;
622         }
623 
624         // Calculate (scale_factor % +/-512), 512 = 2^9, using
625         // technique from "Hacker's Delight" section 10-2.
626         int t = (scale_factor >> 9-1) >>> 32 - 9;
627         exp_adjust = ((scale_factor + t) & (512 -1)) - t;
628 
629         d *= powerOfTwoD(exp_adjust);
630         scale_factor -= exp_adjust;
631 
632         while(scale_factor != 0) {
633             d *= exp_delta;
634             scale_factor -= scale_increment;
635         }
636         return d;
637     }
638 
639     /**
640      * Return {@code f} &times;
641      * 2<sup>{@code scale_factor}</sup> rounded as if performed
642      * by a single correctly rounded floating-point multiply to a
643      * member of the float value set.  See section 4.2.3 of
644      * <cite>The Java&trade; Language Specification</cite>
645      * for a discussion of floating-point
646      * value sets. If the exponent of the result is between the
647      * {@code float}'s minimum exponent and maximum exponent, the
648      * answer is calculated exactly.  If the exponent of the result
649      * would be larger than {@code float}'s maximum exponent, an
650      * infinity is returned.  Note that if the result is subnormal,
651      * precision may be lost; that is, when {@code scalb(x, n)}
652      * is subnormal, {@code scalb(scalb(x, n), -n)} may not equal
653      * <i>x</i>.  When the result is non-NaN, the result has the same
654      * sign as {@code f}.
655      *
656      *<p>
657      * Special cases:
658      * <ul>
659      * <li> If the first argument is NaN, NaN is returned.
660      * <li> If the first argument is infinite, then an infinity of the
661      * same sign is returned.
662      * <li> If the first argument is zero, then a zero of the same
663      * sign is returned.
664      * </ul>
665      *
666      * @param f number to be scaled by a power of two.
667      * @param scale_factor power of 2 used to scale {@code f}
668      * @return {@code f * }2<sup>{@code scale_factor}</sup>
669      * @author Joseph D. Darcy
670      */
scalb(float f, int scale_factor)671      public static float scalb(float f, int scale_factor) {
672         // magnitude of a power of two so large that scaling a finite
673         // nonzero value by it would be guaranteed to over or
674         // underflow; due to rounding, scaling down takes takes an
675         // additional power of two which is reflected here
676         final int MAX_SCALE = FloatConsts.MAX_EXPONENT + -FloatConsts.MIN_EXPONENT +
677                               FloatConsts.SIGNIFICAND_WIDTH + 1;
678 
679         // Make sure scaling factor is in a reasonable range
680         scale_factor = Math.max(Math.min(scale_factor, MAX_SCALE), -MAX_SCALE);
681 
682         /*
683          * Since + MAX_SCALE for float fits well within the double
684          * exponent range and + float -> double conversion is exact
685          * the multiplication below will be exact. Therefore, the
686          * rounding that occurs when the double product is cast to
687          * float will be the correctly rounded float result.  Since
688          * all operations other than the final multiply will be exact,
689          * it is not necessary to declare this method strictfp.
690          */
691         return (float)((double)f*powerOfTwoD(scale_factor));
692     }
693 
694     /**
695      * Returns the floating-point number adjacent to the first
696      * argument in the direction of the second argument.  If both
697      * arguments compare as equal the second argument is returned.
698      *
699      * <p>
700      * Special cases:
701      * <ul>
702      * <li> If either argument is a NaN, then NaN is returned.
703      *
704      * <li> If both arguments are signed zeros, {@code direction}
705      * is returned unchanged (as implied by the requirement of
706      * returning the second argument if the arguments compare as
707      * equal).
708      *
709      * <li> If {@code start} is
710      * &plusmn;{@code Double.MIN_VALUE} and {@code direction}
711      * has a value such that the result should have a smaller
712      * magnitude, then a zero with the same sign as {@code start}
713      * is returned.
714      *
715      * <li> If {@code start} is infinite and
716      * {@code direction} has a value such that the result should
717      * have a smaller magnitude, {@code Double.MAX_VALUE} with the
718      * same sign as {@code start} is returned.
719      *
720      * <li> If {@code start} is equal to &plusmn;
721      * {@code Double.MAX_VALUE} and {@code direction} has a
722      * value such that the result should have a larger magnitude, an
723      * infinity with same sign as {@code start} is returned.
724      * </ul>
725      *
726      * @param start     starting floating-point value
727      * @param direction value indicating which of
728      * {@code start}'s neighbors or {@code start} should
729      * be returned
730      * @return The floating-point number adjacent to {@code start} in the
731      * direction of {@code direction}.
732      * @author Joseph D. Darcy
733      */
nextAfter(double start, double direction)734     public static double nextAfter(double start, double direction) {
735         /*
736          * The cases:
737          *
738          * nextAfter(+infinity, 0)  == MAX_VALUE
739          * nextAfter(+infinity, +infinity)  == +infinity
740          * nextAfter(-infinity, 0)  == -MAX_VALUE
741          * nextAfter(-infinity, -infinity)  == -infinity
742          *
743          * are naturally handled without any additional testing
744          */
745 
746         // First check for NaN values
747         if (isNaN(start) || isNaN(direction)) {
748             // return a NaN derived from the input NaN(s)
749             return start + direction;
750         } else if (start == direction) {
751             return direction;
752         } else {        // start > direction or start < direction
753             // Add +0.0 to get rid of a -0.0 (+0.0 + -0.0 => +0.0)
754             // then bitwise convert start to integer.
755             long transducer = Double.doubleToRawLongBits(start + 0.0d);
756 
757             /*
758              * IEEE 754 floating-point numbers are lexicographically
759              * ordered if treated as signed- magnitude integers .
760              * Since Java's integers are two's complement,
761              * incrementing" the two's complement representation of a
762              * logically negative floating-point value *decrements*
763              * the signed-magnitude representation. Therefore, when
764              * the integer representation of a floating-point values
765              * is less than zero, the adjustment to the representation
766              * is in the opposite direction than would be expected at
767              * first .
768              */
769             if (direction > start) { // Calculate next greater value
770                 transducer = transducer + (transducer >= 0L ? 1L:-1L);
771             } else  { // Calculate next lesser value
772                 assert direction < start;
773                 if (transducer > 0L)
774                     --transducer;
775                 else
776                     if (transducer < 0L )
777                         ++transducer;
778                     /*
779                      * transducer==0, the result is -MIN_VALUE
780                      *
781                      * The transition from zero (implicitly
782                      * positive) to the smallest negative
783                      * signed magnitude value must be done
784                      * explicitly.
785                      */
786                     else
787                         transducer = DoubleConsts.SIGN_BIT_MASK | 1L;
788             }
789 
790             return Double.longBitsToDouble(transducer);
791         }
792     }
793 
794     /**
795      * Returns the floating-point number adjacent to the first
796      * argument in the direction of the second argument.  If both
797      * arguments compare as equal, the second argument is returned.
798      *
799      * <p>
800      * Special cases:
801      * <ul>
802      * <li> If either argument is a NaN, then NaN is returned.
803      *
804      * <li> If both arguments are signed zeros, a {@code float}
805      * zero with the same sign as {@code direction} is returned
806      * (as implied by the requirement of returning the second argument
807      * if the arguments compare as equal).
808      *
809      * <li> If {@code start} is
810      * &plusmn;{@code Float.MIN_VALUE} and {@code direction}
811      * has a value such that the result should have a smaller
812      * magnitude, then a zero with the same sign as {@code start}
813      * is returned.
814      *
815      * <li> If {@code start} is infinite and
816      * {@code direction} has a value such that the result should
817      * have a smaller magnitude, {@code Float.MAX_VALUE} with the
818      * same sign as {@code start} is returned.
819      *
820      * <li> If {@code start} is equal to &plusmn;
821      * {@code Float.MAX_VALUE} and {@code direction} has a
822      * value such that the result should have a larger magnitude, an
823      * infinity with same sign as {@code start} is returned.
824      * </ul>
825      *
826      * @param start     starting floating-point value
827      * @param direction value indicating which of
828      * {@code start}'s neighbors or {@code start} should
829      * be returned
830      * @return The floating-point number adjacent to {@code start} in the
831      * direction of {@code direction}.
832      * @author Joseph D. Darcy
833      */
nextAfter(float start, double direction)834      public static float nextAfter(float start, double direction) {
835         /*
836          * The cases:
837          *
838          * nextAfter(+infinity, 0)  == MAX_VALUE
839          * nextAfter(+infinity, +infinity)  == +infinity
840          * nextAfter(-infinity, 0)  == -MAX_VALUE
841          * nextAfter(-infinity, -infinity)  == -infinity
842          *
843          * are naturally handled without any additional testing
844          */
845 
846         // First check for NaN values
847         if (isNaN(start) || isNaN(direction)) {
848             // return a NaN derived from the input NaN(s)
849             return start + (float)direction;
850         } else if (start == direction) {
851             return (float)direction;
852         } else {        // start > direction or start < direction
853             // Add +0.0 to get rid of a -0.0 (+0.0 + -0.0 => +0.0)
854             // then bitwise convert start to integer.
855             int transducer = Float.floatToRawIntBits(start + 0.0f);
856 
857             /*
858              * IEEE 754 floating-point numbers are lexicographically
859              * ordered if treated as signed- magnitude integers .
860              * Since Java's integers are two's complement,
861              * incrementing" the two's complement representation of a
862              * logically negative floating-point value *decrements*
863              * the signed-magnitude representation. Therefore, when
864              * the integer representation of a floating-point values
865              * is less than zero, the adjustment to the representation
866              * is in the opposite direction than would be expected at
867              * first.
868              */
869             if (direction > start) {// Calculate next greater value
870                 transducer = transducer + (transducer >= 0 ? 1:-1);
871             } else  { // Calculate next lesser value
872                 assert direction < start;
873                 if (transducer > 0)
874                     --transducer;
875                 else
876                     if (transducer < 0 )
877                         ++transducer;
878                     /*
879                      * transducer==0, the result is -MIN_VALUE
880                      *
881                      * The transition from zero (implicitly
882                      * positive) to the smallest negative
883                      * signed magnitude value must be done
884                      * explicitly.
885                      */
886                     else
887                         transducer = FloatConsts.SIGN_BIT_MASK | 1;
888             }
889 
890             return Float.intBitsToFloat(transducer);
891         }
892     }
893 
894     /**
895      * Returns the floating-point value adjacent to {@code d} in
896      * the direction of positive infinity.  This method is
897      * semantically equivalent to {@code nextAfter(d,
898      * Double.POSITIVE_INFINITY)}; however, a {@code nextUp}
899      * implementation may run faster than its equivalent
900      * {@code nextAfter} call.
901      *
902      * <p>Special Cases:
903      * <ul>
904      * <li> If the argument is NaN, the result is NaN.
905      *
906      * <li> If the argument is positive infinity, the result is
907      * positive infinity.
908      *
909      * <li> If the argument is zero, the result is
910      * {@code Double.MIN_VALUE}
911      *
912      * </ul>
913      *
914      * @param d  starting floating-point value
915      * @return The adjacent floating-point value closer to positive
916      * infinity.
917      * @author Joseph D. Darcy
918      */
nextUp(double d)919     public static double nextUp(double d) {
920         if( isNaN(d) || d == Double.POSITIVE_INFINITY)
921             return d;
922         else {
923             d += 0.0d;
924             return Double.longBitsToDouble(Double.doubleToRawLongBits(d) +
925                                            ((d >= 0.0d)?+1L:-1L));
926         }
927     }
928 
929     /**
930      * Returns the floating-point value adjacent to {@code f} in
931      * the direction of positive infinity.  This method is
932      * semantically equivalent to {@code nextAfter(f,
933      * Double.POSITIVE_INFINITY)}; however, a {@code nextUp}
934      * implementation may run faster than its equivalent
935      * {@code nextAfter} call.
936      *
937      * <p>Special Cases:
938      * <ul>
939      * <li> If the argument is NaN, the result is NaN.
940      *
941      * <li> If the argument is positive infinity, the result is
942      * positive infinity.
943      *
944      * <li> If the argument is zero, the result is
945      * {@code Float.MIN_VALUE}
946      *
947      * </ul>
948      *
949      * @param f  starting floating-point value
950      * @return The adjacent floating-point value closer to positive
951      * infinity.
952      * @author Joseph D. Darcy
953      */
nextUp(float f)954      public static float nextUp(float f) {
955         if( isNaN(f) || f == FloatConsts.POSITIVE_INFINITY)
956             return f;
957         else {
958             f += 0.0f;
959             return Float.intBitsToFloat(Float.floatToRawIntBits(f) +
960                                         ((f >= 0.0f)?+1:-1));
961         }
962     }
963 
964     /**
965      * Returns the floating-point value adjacent to {@code d} in
966      * the direction of negative infinity.  This method is
967      * semantically equivalent to {@code nextAfter(d,
968      * Double.NEGATIVE_INFINITY)}; however, a
969      * {@code nextDown} implementation may run faster than its
970      * equivalent {@code nextAfter} call.
971      *
972      * <p>Special Cases:
973      * <ul>
974      * <li> If the argument is NaN, the result is NaN.
975      *
976      * <li> If the argument is negative infinity, the result is
977      * negative infinity.
978      *
979      * <li> If the argument is zero, the result is
980      * {@code -Double.MIN_VALUE}
981      *
982      * </ul>
983      *
984      * @param d  starting floating-point value
985      * @return The adjacent floating-point value closer to negative
986      * infinity.
987      * @author Joseph D. Darcy
988      */
nextDown(double d)989     public static double nextDown(double d) {
990         if( isNaN(d) || d == Double.NEGATIVE_INFINITY)
991             return d;
992         else {
993             if (d == 0.0)
994                 return -Double.MIN_VALUE;
995             else
996                 return Double.longBitsToDouble(Double.doubleToRawLongBits(d) +
997                                                ((d > 0.0d)?-1L:+1L));
998         }
999     }
1000 
1001     /**
1002      * Returns the floating-point value adjacent to {@code f} in
1003      * the direction of negative infinity.  This method is
1004      * semantically equivalent to {@code nextAfter(f,
1005      * Float.NEGATIVE_INFINITY)}; however, a
1006      * {@code nextDown} implementation may run faster than its
1007      * equivalent {@code nextAfter} call.
1008      *
1009      * <p>Special Cases:
1010      * <ul>
1011      * <li> If the argument is NaN, the result is NaN.
1012      *
1013      * <li> If the argument is negative infinity, the result is
1014      * negative infinity.
1015      *
1016      * <li> If the argument is zero, the result is
1017      * {@code -Float.MIN_VALUE}
1018      *
1019      * </ul>
1020      *
1021      * @param f  starting floating-point value
1022      * @return The adjacent floating-point value closer to negative
1023      * infinity.
1024      * @author Joseph D. Darcy
1025      */
nextDown(float f)1026     public static double nextDown(float f) {
1027         if( isNaN(f) || f == Float.NEGATIVE_INFINITY)
1028             return f;
1029         else {
1030             if (f == 0.0f)
1031                 return -Float.MIN_VALUE;
1032             else
1033                 return Float.intBitsToFloat(Float.floatToRawIntBits(f) +
1034                                             ((f > 0.0f)?-1:+1));
1035         }
1036     }
1037 
1038     /**
1039      * Returns the first floating-point argument with the sign of the
1040      * second floating-point argument.  For this method, a NaN
1041      * {@code sign} argument is always treated as if it were
1042      * positive.
1043      *
1044      * @param magnitude  the parameter providing the magnitude of the result
1045      * @param sign   the parameter providing the sign of the result
1046      * @return a value with the magnitude of {@code magnitude}
1047      * and the sign of {@code sign}.
1048      * @author Joseph D. Darcy
1049      * @since 1.5
1050      */
copySign(double magnitude, double sign)1051     public static double copySign(double magnitude, double sign) {
1052         return rawCopySign(magnitude, (isNaN(sign)?1.0d:sign));
1053     }
1054 
1055     /**
1056      * Returns the first floating-point argument with the sign of the
1057      * second floating-point argument.  For this method, a NaN
1058      * {@code sign} argument is always treated as if it were
1059      * positive.
1060      *
1061      * @param magnitude  the parameter providing the magnitude of the result
1062      * @param sign   the parameter providing the sign of the result
1063      * @return a value with the magnitude of {@code magnitude}
1064      * and the sign of {@code sign}.
1065      * @author Joseph D. Darcy
1066      */
copySign(float magnitude, float sign)1067      public static float copySign(float magnitude, float sign) {
1068         return rawCopySign(magnitude, (isNaN(sign)?1.0f:sign));
1069     }
1070 
1071     /**
1072      * Returns the size of an ulp of the argument.  An ulp of a
1073      * {@code double} value is the positive distance between this
1074      * floating-point value and the {@code double} value next
1075      * larger in magnitude.  Note that for non-NaN <i>x</i>,
1076      * <code>ulp(-<i>x</i>) == ulp(<i>x</i>)</code>.
1077      *
1078      * <p>Special Cases:
1079      * <ul>
1080      * <li> If the argument is NaN, then the result is NaN.
1081      * <li> If the argument is positive or negative infinity, then the
1082      * result is positive infinity.
1083      * <li> If the argument is positive or negative zero, then the result is
1084      * {@code Double.MIN_VALUE}.
1085      * <li> If the argument is &plusmn;{@code Double.MAX_VALUE}, then
1086      * the result is equal to 2<sup>971</sup>.
1087      * </ul>
1088      *
1089      * @param d the floating-point value whose ulp is to be returned
1090      * @return the size of an ulp of the argument
1091      * @author Joseph D. Darcy
1092      * @since 1.5
1093      */
ulp(double d)1094     public static double ulp(double d) {
1095         int exp = getExponent(d);
1096 
1097         switch(exp) {
1098         case DoubleConsts.MAX_EXPONENT+1:       // NaN or infinity
1099             return Math.abs(d);
1100 
1101         case DoubleConsts.MIN_EXPONENT-1:       // zero or subnormal
1102             return Double.MIN_VALUE;
1103 
1104         default:
1105             assert exp <= DoubleConsts.MAX_EXPONENT && exp >= DoubleConsts.MIN_EXPONENT;
1106 
1107             // ulp(x) is usually 2^(SIGNIFICAND_WIDTH-1)*(2^ilogb(x))
1108             exp = exp - (DoubleConsts.SIGNIFICAND_WIDTH-1);
1109             if (exp >= DoubleConsts.MIN_EXPONENT) {
1110                 return powerOfTwoD(exp);
1111             }
1112             else {
1113                 // return a subnormal result; left shift integer
1114                 // representation of Double.MIN_VALUE appropriate
1115                 // number of positions
1116                 return Double.longBitsToDouble(1L <<
1117                 (exp - (DoubleConsts.MIN_EXPONENT - (DoubleConsts.SIGNIFICAND_WIDTH-1)) ));
1118             }
1119         }
1120     }
1121 
1122     /**
1123      * Returns the size of an ulp of the argument.  An ulp of a
1124      * {@code float} value is the positive distance between this
1125      * floating-point value and the {@code float} value next
1126      * larger in magnitude.  Note that for non-NaN <i>x</i>,
1127      * <code>ulp(-<i>x</i>) == ulp(<i>x</i>)</code>.
1128      *
1129      * <p>Special Cases:
1130      * <ul>
1131      * <li> If the argument is NaN, then the result is NaN.
1132      * <li> If the argument is positive or negative infinity, then the
1133      * result is positive infinity.
1134      * <li> If the argument is positive or negative zero, then the result is
1135      * {@code Float.MIN_VALUE}.
1136      * <li> If the argument is &plusmn;{@code Float.MAX_VALUE}, then
1137      * the result is equal to 2<sup>104</sup>.
1138      * </ul>
1139      *
1140      * @param f the floating-point value whose ulp is to be returned
1141      * @return the size of an ulp of the argument
1142      * @author Joseph D. Darcy
1143      * @since 1.5
1144      */
ulp(float f)1145      public static float ulp(float f) {
1146         int exp = getExponent(f);
1147 
1148         switch(exp) {
1149         case FloatConsts.MAX_EXPONENT+1:        // NaN or infinity
1150             return Math.abs(f);
1151 
1152         case FloatConsts.MIN_EXPONENT-1:        // zero or subnormal
1153             return FloatConsts.MIN_VALUE;
1154 
1155         default:
1156             assert exp <= FloatConsts.MAX_EXPONENT && exp >= FloatConsts.MIN_EXPONENT;
1157 
1158             // ulp(x) is usually 2^(SIGNIFICAND_WIDTH-1)*(2^ilogb(x))
1159             exp = exp - (FloatConsts.SIGNIFICAND_WIDTH-1);
1160             if (exp >= FloatConsts.MIN_EXPONENT) {
1161                 return powerOfTwoF(exp);
1162             }
1163             else {
1164                 // return a subnormal result; left shift integer
1165                 // representation of FloatConsts.MIN_VALUE appropriate
1166                 // number of positions
1167                 return Float.intBitsToFloat(1 <<
1168                 (exp - (FloatConsts.MIN_EXPONENT - (FloatConsts.SIGNIFICAND_WIDTH-1)) ));
1169             }
1170         }
1171      }
1172 
1173     /**
1174      * Returns the signum function of the argument; zero if the argument
1175      * is zero, 1.0 if the argument is greater than zero, -1.0 if the
1176      * argument is less than zero.
1177      *
1178      * <p>Special Cases:
1179      * <ul>
1180      * <li> If the argument is NaN, then the result is NaN.
1181      * <li> If the argument is positive zero or negative zero, then the
1182      *      result is the same as the argument.
1183      * </ul>
1184      *
1185      * @param d the floating-point value whose signum is to be returned
1186      * @return the signum function of the argument
1187      * @author Joseph D. Darcy
1188      * @since 1.5
1189      */
signum(double d)1190     public static double signum(double d) {
1191         return (d == 0.0 || isNaN(d))?d:copySign(1.0, d);
1192     }
1193 
1194     /**
1195      * Returns the signum function of the argument; zero if the argument
1196      * is zero, 1.0f if the argument is greater than zero, -1.0f if the
1197      * argument is less than zero.
1198      *
1199      * <p>Special Cases:
1200      * <ul>
1201      * <li> If the argument is NaN, then the result is NaN.
1202      * <li> If the argument is positive zero or negative zero, then the
1203      *      result is the same as the argument.
1204      * </ul>
1205      *
1206      * @param f the floating-point value whose signum is to be returned
1207      * @return the signum function of the argument
1208      * @author Joseph D. Darcy
1209      * @since 1.5
1210      */
signum(float f)1211     public static float signum(float f) {
1212         return (f == 0.0f || isNaN(f))?f:copySign(1.0f, f);
1213     }
1214 
1215 }
1216