1 // Copyright 2015, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 //   * Redistributions of source code must retain the above copyright notice,
8 //     this list of conditions and the following disclaimer.
9 //   * Redistributions in binary form must reproduce the above copyright notice,
10 //     this list of conditions and the following disclaimer in the documentation
11 //     and/or other materials provided with the distribution.
12 //   * Neither the name of ARM Limited nor the names of its contributors may be
13 //     used to endorse or promote products derived from this software without
14 //     specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 
27 #include <cstdio>
28 
29 #include "utils-vixl.h"
30 
31 namespace vixl {
32 
33 // The default NaN values (for FPCR.DN=1).
34 const double kFP64DefaultNaN = RawbitsToDouble(UINT64_C(0x7ff8000000000000));
35 const float kFP32DefaultNaN = RawbitsToFloat(0x7fc00000);
36 const Float16 kFP16DefaultNaN = RawbitsToFloat16(0x7e00);
37 
38 // Floating-point zero values.
39 const Float16 kFP16PositiveZero = RawbitsToFloat16(0x0);
40 const Float16 kFP16NegativeZero = RawbitsToFloat16(0x8000);
41 
42 // Floating-point infinity values.
43 const Float16 kFP16PositiveInfinity = RawbitsToFloat16(0x7c00);
44 const Float16 kFP16NegativeInfinity = RawbitsToFloat16(0xfc00);
45 const float kFP32PositiveInfinity = RawbitsToFloat(0x7f800000);
46 const float kFP32NegativeInfinity = RawbitsToFloat(0xff800000);
47 const double kFP64PositiveInfinity =
48     RawbitsToDouble(UINT64_C(0x7ff0000000000000));
49 const double kFP64NegativeInfinity =
50     RawbitsToDouble(UINT64_C(0xfff0000000000000));
51 
IsZero(Float16 value)52 bool IsZero(Float16 value) {
53   uint16_t bits = Float16ToRawbits(value);
54   return (bits == Float16ToRawbits(kFP16PositiveZero) ||
55           bits == Float16ToRawbits(kFP16NegativeZero));
56 }
57 
Float16ToRawbits(Float16 value)58 uint16_t Float16ToRawbits(Float16 value) { return value.rawbits_; }
59 
// Returns the object representation of `value` as a 32-bit unsigned integer.
uint32_t FloatToRawbits(float value) {
  uint32_t raw = 0;
  // Copy the bytes across rather than converting the numeric value.
  memcpy(&raw, &value, sizeof(raw));
  return raw;
}
65 
66 
// Returns the object representation of `value` as a 64-bit unsigned integer.
uint64_t DoubleToRawbits(double value) {
  uint64_t raw = 0;
  // Copy the bytes across rather than converting the numeric value.
  memcpy(&raw, &value, sizeof(raw));
  return raw;
}
72 
73 
RawbitsToFloat16(uint16_t bits)74 Float16 RawbitsToFloat16(uint16_t bits) {
75   Float16 f;
76   f.rawbits_ = bits;
77   return f;
78 }
79 
80 
// Returns the float whose object representation is the 32-bit pattern `bits`.
float RawbitsToFloat(uint32_t bits) {
  float result = 0.0f;
  // Copy the bytes across rather than converting the integer value.
  memcpy(&result, &bits, sizeof(result));
  return result;
}
86 
87 
// Returns the double whose object representation is the 64-bit pattern `bits`.
double RawbitsToDouble(uint64_t bits) {
  double result = 0.0;
  // Copy the bytes across rather than converting the integer value.
  memcpy(&result, &bits, sizeof(result));
  return result;
}
93 
94 
Float16Sign(internal::SimFloat16 val)95 uint32_t Float16Sign(internal::SimFloat16 val) {
96   uint16_t rawbits = Float16ToRawbits(val);
97   return ExtractUnsignedBitfield32(15, 15, rawbits);
98 }
99 
100 
Float16Exp(internal::SimFloat16 val)101 uint32_t Float16Exp(internal::SimFloat16 val) {
102   uint16_t rawbits = Float16ToRawbits(val);
103   return ExtractUnsignedBitfield32(14, 10, rawbits);
104 }
105 
Float16Mantissa(internal::SimFloat16 val)106 uint32_t Float16Mantissa(internal::SimFloat16 val) {
107   uint16_t rawbits = Float16ToRawbits(val);
108   return ExtractUnsignedBitfield32(9, 0, rawbits);
109 }
110 
111 
FloatSign(float val)112 uint32_t FloatSign(float val) {
113   uint32_t rawbits = FloatToRawbits(val);
114   return ExtractUnsignedBitfield32(31, 31, rawbits);
115 }
116 
117 
FloatExp(float val)118 uint32_t FloatExp(float val) {
119   uint32_t rawbits = FloatToRawbits(val);
120   return ExtractUnsignedBitfield32(30, 23, rawbits);
121 }
122 
123 
FloatMantissa(float val)124 uint32_t FloatMantissa(float val) {
125   uint32_t rawbits = FloatToRawbits(val);
126   return ExtractUnsignedBitfield32(22, 0, rawbits);
127 }
128 
129 
DoubleSign(double val)130 uint32_t DoubleSign(double val) {
131   uint64_t rawbits = DoubleToRawbits(val);
132   return static_cast<uint32_t>(ExtractUnsignedBitfield64(63, 63, rawbits));
133 }
134 
135 
DoubleExp(double val)136 uint32_t DoubleExp(double val) {
137   uint64_t rawbits = DoubleToRawbits(val);
138   return static_cast<uint32_t>(ExtractUnsignedBitfield64(62, 52, rawbits));
139 }
140 
141 
DoubleMantissa(double val)142 uint64_t DoubleMantissa(double val) {
143   uint64_t rawbits = DoubleToRawbits(val);
144   return ExtractUnsignedBitfield64(51, 0, rawbits);
145 }
146 
147 
Float16Pack(uint16_t sign,uint16_t exp,uint16_t mantissa)148 internal::SimFloat16 Float16Pack(uint16_t sign,
149                                  uint16_t exp,
150                                  uint16_t mantissa) {
151   uint16_t bits = (sign << 15) | (exp << 10) | mantissa;
152   return RawbitsToFloat16(bits);
153 }
154 
155 
FloatPack(uint32_t sign,uint32_t exp,uint32_t mantissa)156 float FloatPack(uint32_t sign, uint32_t exp, uint32_t mantissa) {
157   uint32_t bits = (sign << 31) | (exp << 23) | mantissa;
158   return RawbitsToFloat(bits);
159 }
160 
161 
DoublePack(uint64_t sign,uint64_t exp,uint64_t mantissa)162 double DoublePack(uint64_t sign, uint64_t exp, uint64_t mantissa) {
163   uint64_t bits = (sign << 63) | (exp << 52) | mantissa;
164   return RawbitsToDouble(bits);
165 }
166 
167 
Float16Classify(Float16 value)168 int Float16Classify(Float16 value) {
169   uint16_t bits = Float16ToRawbits(value);
170   uint16_t exponent_max = (1 << 5) - 1;
171   uint16_t exponent_mask = exponent_max << 10;
172   uint16_t mantissa_mask = (1 << 10) - 1;
173 
174   uint16_t exponent = (bits & exponent_mask) >> 10;
175   uint16_t mantissa = bits & mantissa_mask;
176   if (exponent == 0) {
177     if (mantissa == 0) {
178       return FP_ZERO;
179     }
180     return FP_SUBNORMAL;
181   } else if (exponent == exponent_max) {
182     if (mantissa == 0) {
183       return FP_INFINITE;
184     }
185     return FP_NAN;
186   }
187   return FP_NORMAL;
188 }
189 
190 
CountClearHalfWords(uint64_t imm,unsigned reg_size)191 unsigned CountClearHalfWords(uint64_t imm, unsigned reg_size) {
192   VIXL_ASSERT((reg_size % 8) == 0);
193   int count = 0;
194   for (unsigned i = 0; i < (reg_size / 16); i++) {
195     if ((imm & 0xffff) == 0) {
196       count++;
197     }
198     imm >>= 16;
199   }
200   return count;
201 }
202 
203 
BitCount(uint64_t value)204 int BitCount(uint64_t value) { return CountSetBits(value); }
205 
206 // Float16 definitions.
207 
Float16(double dvalue)208 Float16::Float16(double dvalue) {
209   rawbits_ =
210       Float16ToRawbits(FPToFloat16(dvalue, FPTieEven, kIgnoreDefaultNaN));
211 }
212 
213 namespace internal {
214 
operator -() const215 SimFloat16 SimFloat16::operator-() const {
216   return RawbitsToFloat16(rawbits_ ^ 0x8000);
217 }
218 
219 // SimFloat16 definitions.
operator +(SimFloat16 rhs) const220 SimFloat16 SimFloat16::operator+(SimFloat16 rhs) const {
221   return static_cast<double>(*this) + static_cast<double>(rhs);
222 }
223 
operator -(SimFloat16 rhs) const224 SimFloat16 SimFloat16::operator-(SimFloat16 rhs) const {
225   return static_cast<double>(*this) - static_cast<double>(rhs);
226 }
227 
operator *(SimFloat16 rhs) const228 SimFloat16 SimFloat16::operator*(SimFloat16 rhs) const {
229   return static_cast<double>(*this) * static_cast<double>(rhs);
230 }
231 
operator /(SimFloat16 rhs) const232 SimFloat16 SimFloat16::operator/(SimFloat16 rhs) const {
233   return static_cast<double>(*this) / static_cast<double>(rhs);
234 }
235 
operator <(SimFloat16 rhs) const236 bool SimFloat16::operator<(SimFloat16 rhs) const {
237   return static_cast<double>(*this) < static_cast<double>(rhs);
238 }
239 
operator >(SimFloat16 rhs) const240 bool SimFloat16::operator>(SimFloat16 rhs) const {
241   return static_cast<double>(*this) > static_cast<double>(rhs);
242 }
243 
operator ==(SimFloat16 rhs) const244 bool SimFloat16::operator==(SimFloat16 rhs) const {
245   if (IsNaN(*this) || IsNaN(rhs)) {
246     return false;
247   } else if (IsZero(rhs) && IsZero(*this)) {
248     // +0 and -0 should be treated as equal.
249     return true;
250   }
251   return this->rawbits_ == rhs.rawbits_;
252 }
253 
operator !=(SimFloat16 rhs) const254 bool SimFloat16::operator!=(SimFloat16 rhs) const { return !(*this == rhs); }
255 
operator ==(double rhs) const256 bool SimFloat16::operator==(double rhs) const {
257   return static_cast<double>(*this) == static_cast<double>(rhs);
258 }
259 
operator double() const260 SimFloat16::operator double() const {
261   return FPToDouble(*this, kIgnoreDefaultNaN);
262 }
263 
BitCount(Uint32 value)264 Int64 BitCount(Uint32 value) { return CountSetBits(value.Get()); }
265 
266 }  // namespace internal
267 
FPToFloat(Float16 value,UseDefaultNaN DN,bool * exception)268 float FPToFloat(Float16 value, UseDefaultNaN DN, bool* exception) {
269   uint16_t bits = Float16ToRawbits(value);
270   uint32_t sign = bits >> 15;
271   uint32_t exponent =
272       ExtractUnsignedBitfield32(kFloat16MantissaBits + kFloat16ExponentBits - 1,
273                                 kFloat16MantissaBits,
274                                 bits);
275   uint32_t mantissa =
276       ExtractUnsignedBitfield32(kFloat16MantissaBits - 1, 0, bits);
277 
278   switch (Float16Classify(value)) {
279     case FP_ZERO:
280       return (sign == 0) ? 0.0f : -0.0f;
281 
282     case FP_INFINITE:
283       return (sign == 0) ? kFP32PositiveInfinity : kFP32NegativeInfinity;
284 
285     case FP_SUBNORMAL: {
286       // Calculate shift required to put mantissa into the most-significant bits
287       // of the destination mantissa.
288       int shift = CountLeadingZeros(mantissa << (32 - 10));
289 
290       // Shift mantissa and discard implicit '1'.
291       mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits) + shift + 1;
292       mantissa &= (1 << kFloatMantissaBits) - 1;
293 
294       // Adjust the exponent for the shift applied, and rebias.
295       exponent = exponent - shift + (-15 + 127);
296       break;
297     }
298 
299     case FP_NAN:
300       if (IsSignallingNaN(value)) {
301         if (exception != NULL) {
302           *exception = true;
303         }
304       }
305       if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
306 
307       // Convert NaNs as the processor would:
308       //  - The sign is propagated.
309       //  - The payload (mantissa) is transferred entirely, except that the top
310       //    bit is forced to '1', making the result a quiet NaN. The unused
311       //    (low-order) payload bits are set to 0.
312       exponent = (1 << kFloatExponentBits) - 1;
313 
314       // Increase bits in mantissa, making low-order bits 0.
315       mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
316       mantissa |= 1 << 22;  // Force a quiet NaN.
317       break;
318 
319     case FP_NORMAL:
320       // Increase bits in mantissa, making low-order bits 0.
321       mantissa <<= (kFloatMantissaBits - kFloat16MantissaBits);
322 
323       // Change exponent bias.
324       exponent += (-15 + 127);
325       break;
326 
327     default:
328       VIXL_UNREACHABLE();
329   }
330   return RawbitsToFloat((sign << 31) | (exponent << kFloatMantissaBits) |
331                         mantissa);
332 }
333 
334 
FPToFloat(double value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)335 float FPToFloat(double value,
336                 FPRounding round_mode,
337                 UseDefaultNaN DN,
338                 bool* exception) {
339   // Only the FPTieEven rounding mode is implemented.
340   VIXL_ASSERT((round_mode == FPTieEven) || (round_mode == FPRoundOdd));
341   USE(round_mode);
342 
343   switch (std::fpclassify(value)) {
344     case FP_NAN: {
345       if (IsSignallingNaN(value)) {
346         if (exception != NULL) {
347           *exception = true;
348         }
349       }
350       if (DN == kUseDefaultNaN) return kFP32DefaultNaN;
351 
352       // Convert NaNs as the processor would:
353       //  - The sign is propagated.
354       //  - The payload (mantissa) is transferred as much as possible, except
355       //    that the top bit is forced to '1', making the result a quiet NaN.
356       uint64_t raw = DoubleToRawbits(value);
357 
358       uint32_t sign = raw >> 63;
359       uint32_t exponent = (1 << 8) - 1;
360       uint32_t payload =
361           static_cast<uint32_t>(ExtractUnsignedBitfield64(50, 52 - 23, raw));
362       payload |= (1 << 22);  // Force a quiet NaN.
363 
364       return RawbitsToFloat((sign << 31) | (exponent << 23) | payload);
365     }
366 
367     case FP_ZERO:
368     case FP_INFINITE: {
369       // In a C++ cast, any value representable in the target type will be
370       // unchanged. This is always the case for +/-0.0 and infinities.
371       return static_cast<float>(value);
372     }
373 
374     case FP_NORMAL:
375     case FP_SUBNORMAL: {
376       // Convert double-to-float as the processor would, assuming that FPCR.FZ
377       // (flush-to-zero) is not set.
378       uint64_t raw = DoubleToRawbits(value);
379       // Extract the IEEE-754 double components.
380       uint32_t sign = raw >> 63;
381       // Extract the exponent and remove the IEEE-754 encoding bias.
382       int32_t exponent =
383           static_cast<int32_t>(ExtractUnsignedBitfield64(62, 52, raw)) - 1023;
384       // Extract the mantissa and add the implicit '1' bit.
385       uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
386       if (std::fpclassify(value) == FP_NORMAL) {
387         mantissa |= (UINT64_C(1) << 52);
388       }
389       return FPRoundToFloat(sign, exponent, mantissa, round_mode);
390     }
391   }
392 
393   VIXL_UNREACHABLE();
394   return value;
395 }
396 
397 // TODO: We should consider implementing a full FPToDouble(Float16)
398 // conversion function (for performance reasons).
FPToDouble(Float16 value,UseDefaultNaN DN,bool * exception)399 double FPToDouble(Float16 value, UseDefaultNaN DN, bool* exception) {
400   // We can rely on implicit float to double conversion here.
401   return FPToFloat(value, DN, exception);
402 }
403 
404 
FPToDouble(float value,UseDefaultNaN DN,bool * exception)405 double FPToDouble(float value, UseDefaultNaN DN, bool* exception) {
406   switch (std::fpclassify(value)) {
407     case FP_NAN: {
408       if (IsSignallingNaN(value)) {
409         if (exception != NULL) {
410           *exception = true;
411         }
412       }
413       if (DN == kUseDefaultNaN) return kFP64DefaultNaN;
414 
415       // Convert NaNs as the processor would:
416       //  - The sign is propagated.
417       //  - The payload (mantissa) is transferred entirely, except that the top
418       //    bit is forced to '1', making the result a quiet NaN. The unused
419       //    (low-order) payload bits are set to 0.
420       uint32_t raw = FloatToRawbits(value);
421 
422       uint64_t sign = raw >> 31;
423       uint64_t exponent = (1 << 11) - 1;
424       uint64_t payload = ExtractUnsignedBitfield64(21, 0, raw);
425       payload <<= (52 - 23);           // The unused low-order bits should be 0.
426       payload |= (UINT64_C(1) << 51);  // Force a quiet NaN.
427 
428       return RawbitsToDouble((sign << 63) | (exponent << 52) | payload);
429     }
430 
431     case FP_ZERO:
432     case FP_NORMAL:
433     case FP_SUBNORMAL:
434     case FP_INFINITE: {
435       // All other inputs are preserved in a standard cast, because every value
436       // representable using an IEEE-754 float is also representable using an
437       // IEEE-754 double.
438       return static_cast<double>(value);
439     }
440   }
441 
442   VIXL_UNREACHABLE();
443   return static_cast<double>(value);
444 }
445 
446 
FPToFloat16(float value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)447 Float16 FPToFloat16(float value,
448                     FPRounding round_mode,
449                     UseDefaultNaN DN,
450                     bool* exception) {
451   // Only the FPTieEven rounding mode is implemented.
452   VIXL_ASSERT(round_mode == FPTieEven);
453   USE(round_mode);
454 
455   uint32_t raw = FloatToRawbits(value);
456   int32_t sign = raw >> 31;
457   int32_t exponent = ExtractUnsignedBitfield32(30, 23, raw) - 127;
458   uint32_t mantissa = ExtractUnsignedBitfield32(22, 0, raw);
459 
460   switch (std::fpclassify(value)) {
461     case FP_NAN: {
462       if (IsSignallingNaN(value)) {
463         if (exception != NULL) {
464           *exception = true;
465         }
466       }
467       if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
468 
469       // Convert NaNs as the processor would:
470       //  - The sign is propagated.
471       //  - The payload (mantissa) is transferred as much as possible, except
472       //    that the top bit is forced to '1', making the result a quiet NaN.
473       uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
474                                     : Float16ToRawbits(kFP16NegativeInfinity);
475       result |= mantissa >> (kFloatMantissaBits - kFloat16MantissaBits);
476       result |= (1 << 9);  // Force a quiet NaN;
477       return RawbitsToFloat16(result);
478     }
479 
480     case FP_ZERO:
481       return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
482 
483     case FP_INFINITE:
484       return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
485 
486     case FP_NORMAL:
487     case FP_SUBNORMAL: {
488       // Convert float-to-half as the processor would, assuming that FPCR.FZ
489       // (flush-to-zero) is not set.
490 
491       // Add the implicit '1' bit to the mantissa.
492       mantissa += (1 << 23);
493       return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
494     }
495   }
496 
497   VIXL_UNREACHABLE();
498   return kFP16PositiveZero;
499 }
500 
501 
FPToFloat16(double value,FPRounding round_mode,UseDefaultNaN DN,bool * exception)502 Float16 FPToFloat16(double value,
503                     FPRounding round_mode,
504                     UseDefaultNaN DN,
505                     bool* exception) {
506   // Only the FPTieEven rounding mode is implemented.
507   VIXL_ASSERT(round_mode == FPTieEven);
508   USE(round_mode);
509 
510   uint64_t raw = DoubleToRawbits(value);
511   int32_t sign = raw >> 63;
512   int64_t exponent = ExtractUnsignedBitfield64(62, 52, raw) - 1023;
513   uint64_t mantissa = ExtractUnsignedBitfield64(51, 0, raw);
514 
515   switch (std::fpclassify(value)) {
516     case FP_NAN: {
517       if (IsSignallingNaN(value)) {
518         if (exception != NULL) {
519           *exception = true;
520         }
521       }
522       if (DN == kUseDefaultNaN) return kFP16DefaultNaN;
523 
524       // Convert NaNs as the processor would:
525       //  - The sign is propagated.
526       //  - The payload (mantissa) is transferred as much as possible, except
527       //    that the top bit is forced to '1', making the result a quiet NaN.
528       uint16_t result = (sign == 0) ? Float16ToRawbits(kFP16PositiveInfinity)
529                                     : Float16ToRawbits(kFP16NegativeInfinity);
530       result |= mantissa >> (kDoubleMantissaBits - kFloat16MantissaBits);
531       result |= (1 << 9);  // Force a quiet NaN;
532       return RawbitsToFloat16(result);
533     }
534 
535     case FP_ZERO:
536       return (sign == 0) ? kFP16PositiveZero : kFP16NegativeZero;
537 
538     case FP_INFINITE:
539       return (sign == 0) ? kFP16PositiveInfinity : kFP16NegativeInfinity;
540     case FP_NORMAL:
541     case FP_SUBNORMAL: {
542       // Convert double-to-half as the processor would, assuming that FPCR.FZ
543       // (flush-to-zero) is not set.
544 
545       // Add the implicit '1' bit to the mantissa.
546       mantissa += (UINT64_C(1) << 52);
547       return FPRoundToFloat16(sign, exponent, mantissa, round_mode);
548     }
549   }
550 
551   VIXL_UNREACHABLE();
552   return kFP16PositiveZero;
553 }
554 
555 }  // namespace vixl
556