1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21 #include <cmath>
22 #undef _USE_MATH_DEFINES
23 
24 #include <string.h>
25 #include <algorithm>
26 #include <type_traits>
27 
28 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
29 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
30 #include "tensorflow/core/lib/random/philox_random.h"
31 
32 namespace tensorflow {
33 namespace random {
34 
35 // Helper function to convert a 16-bit integer to a half between [0..1).
36 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
37 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
38 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
39 // Helper function to convert a 32-bit integer to a float between [0..1).
40 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
41 // Helper function to convert two 32-bit integers to a double between [0..1).
42 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
43 
44 // Computes a + b. Requires that the result is representable in the destination
45 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
46 // need *not* be representable in that type. (The condition on b excludes the
47 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
48 // compute.)
49 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)50 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
51                                    typename std::make_unsigned<Int>::type b) {
52   // Implementation note: both b_div_2 and b - b_div_2 are positive and
53   // representatble as Int.
54   auto b_div_2 = b >> 1;
55   return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
56 }
57 
58 // A class that generates uniform distribution random numbers from the
59 // underlying random integer generator.
60 // Arguments:
61 //   Generator: a generator type that returns a number of uint32 upon each
62 //              invocation. It needs to define kResultElementCount for the
63 //              sample count for each invocation, and ResultType for the
64 //              actual returned sample type.
65 //   RealType: the data type of the real numbers that will be returned by the
66 //             distribution. This could be either float or double for now.
67 // This class is meant to be implemented through specialization. The default
68 // is not defined by design.
69 template <class Generator, typename RealType>
70 class UniformDistribution;
71 
72 template <class Generator>
73 class UniformDistribution<Generator, Eigen::half> {
74  public:
75   // The number of elements that will be returned.
76   static const int kResultElementCount = Generator::kResultElementCount;
77   // Cost of generation of a single element (in cycles).
78   static const int kElementCost = 3;
79   // Indicate that this distribution may take variable number of samples
80   // during the runtime.
81   static const bool kVariableSamplesPerOutput = false;
82   typedef Array<Eigen::half, kResultElementCount> ResultType;
83   typedef Eigen::half ResultElementType;
84 
85   PHILOX_DEVICE_INLINE
operator()86   ResultType operator()(Generator* gen) {
87     typename Generator::ResultType sample = (*gen)();
88     ResultType result;
89     for (int i = 0; i < kResultElementCount; ++i) {
90       result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
91     }
92     return result;
93   }
94 };
95 
96 template <class Generator>
97 class UniformDistribution<Generator, bfloat16> {
98  public:
99   // The number of elements that will be returned.
100   static const int kResultElementCount = Generator::kResultElementCount;
101   // Cost of generation of a single element (in cycles).
102   static const int kElementCost = 3;
103   // Indicate that this distribution may take variable number of samples
104   // during the runtime.
105   static const bool kVariableSamplesPerOutput = false;
106   typedef Array<bfloat16, kResultElementCount> ResultType;
107   typedef bfloat16 ResultElementType;
108 
109   PHILOX_DEVICE_INLINE
operator()110   ResultType operator()(Generator* gen) {
111     typename Generator::ResultType sample = (*gen)();
112     ResultType result;
113     for (int i = 0; i < kResultElementCount; ++i) {
114       result[i] = Uint16ToGfloat16(sample[i]);
115     }
116     return result;
117   }
118 };
119 
120 template <class Generator>
121 class UniformDistribution<Generator, float> {
122  public:
123   // The number of elements that will be returned.
124   static const int kResultElementCount = Generator::kResultElementCount;
125   // Cost of generation of a single element (in cycles).
126   static const int kElementCost = 3;
127   // Indicate that this distribution may take variable number of samples
128   // during the runtime.
129   static const bool kVariableSamplesPerOutput = false;
130   typedef Array<float, kResultElementCount> ResultType;
131   typedef float ResultElementType;
132 
133   PHILOX_DEVICE_INLINE
operator()134   ResultType operator()(Generator* gen) {
135     typename Generator::ResultType sample = (*gen)();
136     ResultType result;
137     for (int i = 0; i < kResultElementCount; ++i) {
138       result[i] = Uint32ToFloat(sample[i]);
139     }
140     return result;
141   }
142 };
143 
144 template <class Generator>
145 class UniformDistribution<Generator, double> {
146  public:
147   // The number of elements that will be returned.
148   static const int kResultElementCount = Generator::kResultElementCount / 2;
149   // Cost of generation of a single element (in cycles).
150   static const int kElementCost = 3;
151   // Indicate that this distribution may take variable number of samples
152   // during the runtime.
153   static const bool kVariableSamplesPerOutput = false;
154   typedef Array<double, kResultElementCount> ResultType;
155   typedef double ResultElementType;
156 
157   PHILOX_DEVICE_INLINE
operator()158   ResultType operator()(Generator* gen) {
159     typename Generator::ResultType sample = (*gen)();
160     ResultType result;
161     for (int i = 0; i < kResultElementCount; ++i) {
162       result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
163     }
164     return result;
165   }
166 };
167 
168 template <class Generator>
169 class UniformDistribution<Generator, int32> {
170  public:
171   // The number of elements that will be returned.
172   static const int kResultElementCount = Generator::kResultElementCount;
173   // Cost of generation of a single element (in cycles).
174   static const int kElementCost = 3;
175   // Indicate that this distribution may take variable number of samples
176   // during the runtime.
177   static const bool kVariableSamplesPerOutput = false;
178   typedef Array<int32, kResultElementCount> ResultType;
179   typedef int32 ResultElementType;
180 
181   // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)182   UniformDistribution(int32 lo, int32 hi)
183       : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
184 
185   PHILOX_DEVICE_INLINE
operator()186   ResultType operator()(Generator* gen) {
187     typename Generator::ResultType sample = (*gen)();
188     ResultType result;
189     for (int i = 0; i < kResultElementCount; ++i) {
190       result[i] = SignedAdd(lo_, sample[i] % range_);
191     }
192     return result;
193   }
194 
195  private:
196   // Note that lo_ is intentionally signed while range_ is intentionally
197   // unsigned.  This is because hi - lo can overflow signed integers if
198   // lo < 0 < hi, but always fits in unsigned.
199   int32 lo_;
200   uint32 range_;
201 };
202 
203 template <class Generator>
204 class UniformDistribution<Generator, int64> {
205  public:
206   // The number of elements that will be returned.
207   static const int kResultElementCount = Generator::kResultElementCount / 2;
208   // Cost of generation of a single element (in cycles).
209   static const int kElementCost = 3;
210   // Indicate that this distribution may take variable number of samples
211   // during the runtime.
212   static const bool kVariableSamplesPerOutput = false;
213   typedef Array<int64, kResultElementCount> ResultType;
214   typedef int64 ResultElementType;
215 
216   // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)217   UniformDistribution(int64 lo, int64 hi)
218       : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
219 
220   PHILOX_DEVICE_INLINE
operator()221   ResultType operator()(Generator* gen) {
222     typename Generator::ResultType sample = (*gen)();
223     ResultType result;
224     for (int i = 0; i < kResultElementCount; ++i) {
225       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
226       result[i] = SignedAdd(lo_, bits % range_);
227     }
228     return result;
229   }
230 
231  private:
232   // Note that lo_ is intentionally signed while range_ is intentionally
233   // unsigned.  This is because hi - lo can overflow signed integers if
234   // lo < 0 < hi, but always fits in unsigned.
235   int64 lo_;
236   uint64 range_;
237 };
238 
239 // A class that adapts the underlying native multiple samples to return a single
240 // sample at a time.
241 template <class Generator>
242 class SingleSampleAdapter {
243  public:
244   // The number of elements that will be returned.
245   static const int kResultElementCount = 1;
246   // The number of elements that will be returned by the underlying generator.
247   static const int kNativeElementCount = Generator::kResultElementCount;
248   typedef typename Generator::ResultElementType ResultType;
249   typedef typename Generator::ResultElementType ResultElementType;
250 
251   PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)252   explicit SingleSampleAdapter(Generator* gen)
253       : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
254 
255   PHILOX_DEVICE_INLINE
operator()256   ResultType operator()() {
257     if (used_result_index_ == Generator::kResultElementCount) {
258       unused_results_ = (*generator_)();
259       used_result_index_ = 0;
260     }
261 
262     return unused_results_[used_result_index_++];
263   }
264 
265   PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)266   void Skip(uint64 num_skips) {
267     if (!num_skips) {
268       return;
269     }
270     int num_unused_results = kNativeElementCount - used_result_index_;
271     if (num_skips <= num_unused_results) {
272       used_result_index_ += num_skips;
273       return;
274     }
275     num_skips -= num_unused_results;
276     used_result_index_ = kNativeElementCount;
277     SkipFromGenerator(num_skips / kNativeElementCount);
278     num_skips = num_skips % kNativeElementCount;
279     if (num_skips) {
280       unused_results_ = (*generator_)();
281       used_result_index_ = num_skips;
282     }
283   }
284 
285  private:
286   // This implementation iteratively skips over `num_skips` samples
287   // from `generator_`. There is an O(1) implementation for PhiloxRandom
288   // in random_distributions.cc.
289   PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)290   void SkipFromGenerator(uint64 num_skips) {
291     while (num_skips--) {
292       (*generator_)();
293     }
294   }
295 
296   Generator* generator_;
297   typename Generator::ResultType unused_results_;
298   int used_result_index_;
299 };
300 
301 // A class that generates unit normal distribution random numbers from the
302 // underlying random integer generator.
303 // Arguments:
304 //   Generator: a generator type that returns a number of uint32 upon each
305 //              each invocation. It needs to define kResultElementCount for the
306 //              sample count for each invocation, and ResultType for actual
307 //              returned sample type.
308 //   RealType: the data type of the real numbers that will be returned by the
309 //             distribution. This could be either float or double for now.
310 // This class is meant to be implemented through specialization. The default
311 // is not defined by design.
312 template <class Generator, typename RealType>
313 class NormalDistribution;
314 
315 PHILOX_DEVICE_INLINE
316 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
317 
318 PHILOX_DEVICE_INLINE
319 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
320                      double* d1);
321 
322 // Exactly like the float version, except that we convert to half afterwards;
323 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
324 // gain from working in half internally.
325 template <class Generator>
326 class NormalDistribution<Generator, Eigen::half> {
327  public:
328   // The number of elements that will be returned.
329   static const int kResultElementCount = Generator::kResultElementCount;
330   // Cost of generation of a single element (in cycles).
331   static const int kElementCost = 70;
332   // Indicate that this distribution may take variable number of samples
333   // during the runtime.
334   static const bool kVariableSamplesPerOutput = false;
335   typedef Array<Eigen::half, kResultElementCount> ResultType;
336   typedef Eigen::half ResultElementType;
337 
338   PHILOX_DEVICE_INLINE
operator()339   ResultType operator()(Generator* gen) {
340     typename Generator::ResultType sample = (*gen)();
341     ResultType result;
342     for (int i = 0; i < kResultElementCount; i += 2) {
343       float f[2];
344       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
345       result[i] = Eigen::half(f[0]);
346       result[i + 1] = Eigen::half(f[1]);
347     }
348     return result;
349   }
350 };
351 
352 template <class Generator>
353 class NormalDistribution<Generator, bfloat16> {
354  public:
355   // The number of elements that will be returned.
356   static const int kResultElementCount = Generator::kResultElementCount;
357   // Cost of generation of a single element (in cycles).
358   static const int kElementCost = 70;
359   // Indicate that this distribution may take variable number of samples
360   // during the runtime.
361   static const bool kVariableSamplesPerOutput = false;
362   typedef Array<bfloat16, kResultElementCount> ResultType;
363   typedef bfloat16 ResultElementType;
364 
365   PHILOX_DEVICE_INLINE
operator()366   ResultType operator()(Generator* gen) {
367     typename Generator::ResultType sample = (*gen)();
368     ResultType result;
369     static_assert(kResultElementCount % 2 == 0,
370                   "kResultElementCount should be an even number");
371     for (int i = 0; i < kResultElementCount; i += 2) {
372       float f[2];
373       // Box-Muller transform requires processing 2 elements at a time.
374       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
375       result[i] = bfloat16(f[0]);
376       result[i + 1] = bfloat16(f[1]);
377     }
378     return result;
379   }
380 };
381 
382 template <class Generator>
383 class NormalDistribution<Generator, float> {
384  public:
385   // The number of elements that will be returned.
386   static const int kResultElementCount = Generator::kResultElementCount;
387   // Cost of generation of a single element (in cycles).
388   static const int kElementCost = 70;
389   // Indicate that this distribution may take variable number of samples
390   // during the runtime.
391   static const bool kVariableSamplesPerOutput = false;
392   typedef Array<float, kResultElementCount> ResultType;
393   typedef float ResultElementType;
394 
395   PHILOX_DEVICE_INLINE
operator()396   ResultType operator()(Generator* gen) {
397     typename Generator::ResultType sample = (*gen)();
398     ResultType result;
399     for (int i = 0; i < kResultElementCount; i += 2) {
400       BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
401     }
402     return result;
403   }
404 };
405 
406 template <class Generator>
407 class NormalDistribution<Generator, double> {
408  public:
409   // The number of elements that will be returned.
410   static const int kResultElementCount = Generator::kResultElementCount / 2;
411   // Cost of generation of a single element (in cycles).
412   static const int kElementCost = 70;
413   // Indicate that this distribution may take variable number of samples
414   // during the runtime.
415   static const bool kVariableSamplesPerOutput = false;
416   typedef Array<double, kResultElementCount> ResultType;
417   typedef double ResultElementType;
418 
419   PHILOX_DEVICE_INLINE
operator()420   ResultType operator()(Generator* gen) {
421     typename Generator::ResultType sample = (*gen)();
422     ResultType result;
423     for (int i = 0; i < kResultElementCount; i += 2) {
424       const int i2 = 2 * i;
425       BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
426                       sample[i2 + 3], &result[i], &result[i + 1]);
427     }
428     return result;
429   }
430 };
431 
432 // A class that returns standard normal distribution between
433 // [-kTruncateValue, kTruncateValue].
434 // Arguments:
435 //   Generator: a generator type that returns a number of uint32 upon each
436 //              each invocation. It needs to define kResultElementCount for the
437 //              sample count for each invocation, and ResultType for actual
438 //              returned sample type.
439 //   RealType: the data type of the real numbers that will be returned by the
440 //             distribution. This could be either float or double for now.
441 // This class is meant to be implemented through specialization. The default
442 // is not defined by design.
443 template <class SingleSampleGenerator, typename RealType>
444 class TruncatedNormalDistribution;
445 
446 // Exactly like the float version, except that we convert to half afterwards;
447 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
448 // gain from working in half internally.
449 template <class SingleSampleGenerator>
450 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
451  public:
452   // The number of elements that will be returned.
453   static const int kResultElementCount =
454       SingleSampleGenerator::kNativeElementCount;
455   // Cost of generation of a single element (in cycles).
456   static const int kElementCost = 90;
457   // Indicate that this distribution may take variable number of samples
458   // during the runtime.
459   static const bool kVariableSamplesPerOutput = true;
460   // The threshold where the normal distribution is truncated.
461   const float kTruncateValue = 2.0f;
462 
463   typedef Array<Eigen::half, kResultElementCount> ResultType;
464   typedef Eigen::half ResultElementType;
465 
466   PHILOX_DEVICE_INLINE
operator()467   ResultType operator()(SingleSampleGenerator* gen) {
468     ResultType results;
469     int index = 0;
470     while (true) {
471       // Repeatedly take samples from the normal distribution, until we have
472       // the desired number of elements that fall within the pre-defined cutoff
473       // threshold.
474       const uint32 x0 = (*gen)();
475       const uint32 x1 = (*gen)();
476       float f[2];
477       BoxMullerFloat(x0, x1, &f[0], &f[1]);
478 
479       for (int i = 0; i < 2; ++i) {
480         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
481           results[index++] = Eigen::half(f[i]);
482           if (index >= kResultElementCount) {
483             return results;
484           }
485         }
486       }
487     }
488   }
489 };
490 
491 template <class SingleSampleGenerator>
492 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
493  public:
494   // The number of elements that will be returned.
495   static const int kResultElementCount =
496       SingleSampleGenerator::kNativeElementCount;
497   // Cost of generation of a single element (in cycles).
498   static const int kElementCost = 90;
499   // Indicate that this distribution may take variable number of samples
500   // during the runtime.
501   static const bool kVariableSamplesPerOutput = true;
502   // The threshold where the normal distribution is truncated.
503   const float kTruncateValue = 2.0f;
504 
505   typedef Array<bfloat16, kResultElementCount> ResultType;
506   typedef bfloat16 ResultElementType;
507 
508   PHILOX_DEVICE_INLINE
operator()509   ResultType operator()(SingleSampleGenerator* gen) {
510     ResultType results;
511     int index = 0;
512     while (true) {
513       // Repeatedly take samples from the normal distribution, until we have
514       // the desired number of elements that fall within the pre-defined cutoff
515       // threshold.
516       const uint32 x0 = (*gen)();
517       const uint32 x1 = (*gen)();
518       float f[2];
519       BoxMullerFloat(x0, x1, &f[0], &f[1]);
520 
521       for (int i = 0; i < 2; ++i) {
522         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
523           results[index++] = bfloat16(f[i]);
524           if (index >= kResultElementCount) {
525             return results;
526           }
527         }
528       }
529     }
530   }
531 };
532 
533 // Partial specialization for float.
534 template <class SingleSampleGenerator>
535 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
536  public:
537   // The number of elements that will be returned.
538   static const int kResultElementCount =
539       SingleSampleGenerator::kNativeElementCount;
540   // Cost of generation of a single element (in cycles).
541   static const int kElementCost = 90;
542   // Indicate that this distribution may take variable number of samples
543   // during the runtime.
544   static const bool kVariableSamplesPerOutput = true;
545   // The threshold where the normal distribution is truncated.
546   const float kTruncateValue = 2.0f;
547 
548   typedef Array<float, kResultElementCount> ResultType;
549   typedef float ResultElementType;
550 
551   PHILOX_DEVICE_INLINE
operator()552   ResultType operator()(SingleSampleGenerator* gen) {
553     ResultType results;
554     int index = 0;
555     while (true) {
556       // Repeatedly take samples from the normal distribution, until we have
557       // the desired number of elements that fall within the pre-defined cutoff
558       // threshold.
559       const uint32 x0 = (*gen)();
560       const uint32 x1 = (*gen)();
561       float f[2];
562       BoxMullerFloat(x0, x1, &f[0], &f[1]);
563 
564       for (int i = 0; i < 2; ++i) {
565         if (Eigen::numext::abs(f[i]) < kTruncateValue) {
566           results[index++] = f[i];
567           if (index >= kResultElementCount) {
568             return results;
569           }
570         }
571       }
572     }
573   }
574 };
575 
576 // Partial specialization for double.
577 template <class SingleSampleGenerator>
578 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
579  public:
580   // The number of elements that will be returned.
581   static const int kResultElementCount =
582       (SingleSampleGenerator::kNativeElementCount > 1)
583           ? SingleSampleGenerator::kNativeElementCount / 2
584           : 1;
585   // Cost of generation of a single element (in cycles).
586   static const int kElementCost = 90;
587   // Indicate that this distribution may take variable number of samples
588   // during the runtime.
589   static const bool kVariableSamplesPerOutput = true;
590   typedef Array<double, kResultElementCount> ResultType;
591   typedef double ResultElementType;
592   const double kTruncateValue = 2.0;
593 
594   PHILOX_DEVICE_INLINE
operator()595   ResultType operator()(SingleSampleGenerator* gen) {
596     ResultType results;
597     int index = 0;
598     while (1) {
599       const uint32 x0 = (*gen)();
600       const uint32 x1 = (*gen)();
601       const uint32 x2 = (*gen)();
602       const uint32 x3 = (*gen)();
603       double d[2];
604       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
605 
606       for (int i = 0; i < 2; ++i) {
607         if (Eigen::numext::abs(d[i]) < kTruncateValue) {
608           results[index++] = d[i];
609           if (index >= kResultElementCount) {
610             return results;
611           }
612         }
613       }
614     }
615   }
616 };
617 
618 // Helper function to convert two 32-bit uniform integers to two floats
619 // under the unit normal distribution.
620 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)621 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
622   // This function implements the Box-Muller transform:
623   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
624   // Do not send a really small number to log().
625   // We cannot mark "epsilon" as "static const" because NVCC would complain
626   const float epsilon = 1.0e-7f;
627   float u1 = Uint32ToFloat(x0);
628   if (u1 < epsilon) {
629     u1 = epsilon;
630   }
631   const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
632   const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
633 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
634   *f0 = Eigen::numext::sin(v1);
635   *f1 = Eigen::numext::cos(v1);
636 #else
637   sincosf(v1, f0, f1);
638 #endif
639   *f0 *= u2;
640   *f1 *= u2;
641 }
642 
643 // Helper function to convert four 32-bit uniform integers to two doubles
644 // under the unit normal distribution.
645 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)646 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
647                      double* d1) {
648   // This function implements the Box-Muller transform:
649   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
650   // Do not send a really small number to log().
651   // We cannot mark "epsilon" as "static const" because NVCC would complain
652   const double epsilon = 1.0e-7;
653   double u1 = Uint64ToDouble(x0, x1);
654   if (u1 < epsilon) {
655     u1 = epsilon;
656   }
657   const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
658   const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
659 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
660   *d0 = Eigen::numext::sin(v1);
661   *d1 = Eigen::numext::cos(v1);
662 #else
663   sincos(v1, d0, d1);
664 #endif
665   *d0 *= u2;
666   *d1 *= u2;
667 }
668 
669 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)670 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
671   // IEEE754 halfs are formatted as follows (MSB first):
672   //    sign(1) exponent(5) mantissa(10)
673   // Conceptually construct the following:
674   //    sign == 0
675   //    exponent == 15  -- an excess 15 representation of a zero exponent
676   //    mantissa == 10 random bits
677   const uint16 man = x & 0x3ffu;  // 10 bit mantissa
678   const uint16 exp = static_cast<uint16>(15);
679   const uint16 val = (exp << 10) | man;
680 
681   Eigen::half result;
682   result.x = val;
683   return result - Eigen::half(1.0);
684 }
685 
686 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
687 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)688 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
689   // bfloat are formatted as follows (MSB first):
690   //    sign(1) exponent(8) mantissa(7)
691   // Conceptually construct the following:
692   //    sign == 0
693   //    exponent == 127  -- an excess 127 representation of a zero exponent
694   //    mantissa == 7 random bits
695   const uint16 man = x & 0x7fu;  // 7 bit mantissa
696   const uint16 exp = static_cast<uint16>(127);
697   const uint16 val = (exp << 7) | man;
698 
699   bfloat16 result;
700   memcpy(&result, &val, sizeof(val));
701   // The mantissa has an implicit leading 1, so the above code creates a value
702   // in [1, 2). The minus will not cause a rounding that makes the result 1.
703   // Instead it will just be close to 1.
704   return result - bfloat16(1.0);
705 }
706 
707 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)708 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
709   // IEEE754 floats are formatted as follows (MSB first):
710   //    sign(1) exponent(8) mantissa(23)
711   // Conceptually construct the following:
712   //    sign == 0
713   //    exponent == 127  -- an excess 127 representation of a zero exponent
714   //    mantissa == 23 random bits
715   const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
716   const uint32 exp = static_cast<uint32>(127);
717   const uint32 val = (exp << 23) | man;
718 
719   // Assumes that endian-ness is same for float and uint32.
720   float result;
721   memcpy(&result, &val, sizeof(val));
722   return result - 1.0f;
723 }
724 
725 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)726 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
727   // IEEE754 doubles are formatted as follows (MSB first):
728   //    sign(1) exponent(11) mantissa(52)
729   // Conceptually construct the following:
730   //    sign == 0
731   //    exponent == 1023  -- an excess 1023 representation of a zero exponent
732   //    mantissa == 52 random bits
733   const uint32 mhi = x0 & 0xfffffu;  // upper 20 bits of mantissa
734   const uint32 mlo = x1;             // lower 32 bits of mantissa
735   const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
736   const uint64 exp = static_cast<uint64>(1023);
737   const uint64 val = (exp << 52) | man;
738   // Assumes that endian-ness is same for double and uint64.
739   double result;
740   memcpy(&result, &val, sizeof(val));
741   return result - 1.0;
742 }
743 
744 }  // namespace random
745 }  // namespace tensorflow
746 
747 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
748