1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 // This include is needed in order to prevent a build error when including
20 // tensorflow/core/lib/bfloat16/bfloat16.h.
21 #include <iostream>
22 
23 #define _USE_MATH_DEFINES
24 #include <math.h>
25 
26 #include <cmath>
27 #undef _USE_MATH_DEFINES
28 
29 #include <string.h>
30 #include <tensorflow/core/lib/bfloat16/bfloat16.h>
31 
32 #pragma clang diagnostic push
33 #pragma clang diagnostic ignored "-Wunused-parameter"
34 #pragma clang diagnostic ignored "-Winvalid-partial-specialization"
35 #include <unsupported/Eigen/CXX11/Tensor>
36 #pragma clang diagnostic pop
37 
38 #include <algorithm>
39 #include <type_traits>
40 
41 #include "philox_random.h"
42 
43 namespace tensorflow {
44 namespace random {
45 
46 // Helper function to convert a 16-bit integer to a half between [0..1).
47 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
48 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
49 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
50 // Helper function to convert a 32-bit integer to a float between [0..1).
51 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
52 // Helper function to convert two 32-bit integers to a double between [0..1).
53 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
54 
55 // Computes a + b. Requires that the result is representable in the destination
56 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
57 // need *not* be representable in that type. (The condition on b excludes the
58 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
59 // compute.)
60 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)61 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
62     // Implementation note: both b_div_2 and b - b_div_2 are positive and
63     // representatble as Int.
64     auto b_div_2 = b >> 1;
65     return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
66 }
67 
68 // A class that generates uniform distribution random numbers from the
69 // underlying random integer generator.
70 // Arguments:
71 //   Generator: a generator type that returns a number of uint32 upon each
72 //              invocation. It needs to define kResultElementCount for the
73 //              sample count for each invocation, and ResultType for the
74 //              actual returned sample type.
75 //   RealType: the data type of the real numbers that will be returned by the
76 //             distribution. This could be either float or double for now.
77 // This class is meant to be implemented through specialization. The default
78 // is not defined by design.
79 template <class Generator, typename RealType>
80 class UniformDistribution;
81 
82 template <class Generator>
83 class UniformDistribution<Generator, Eigen::half> {
84    public:
85     // The number of elements that will be returned.
86     static const int kResultElementCount = Generator::kResultElementCount;
87     // Cost of generation of a single element (in cycles).
88     static const int kElementCost = 3;
89     // Indicate that this distribution may take variable number of samples
90     // during the runtime.
91     static const bool kVariableSamplesPerOutput = false;
92     typedef Array<Eigen::half, kResultElementCount> ResultType;
93     typedef Eigen::half ResultElementType;
94 
95     PHILOX_DEVICE_INLINE
operator()96     ResultType operator()(Generator* gen) {
97         typename Generator::ResultType sample = (*gen)();
98         ResultType result;
99         for (int i = 0; i < kResultElementCount; ++i) {
100             result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
101         }
102         return result;
103     }
104 };
105 
106 template <class Generator>
107 class UniformDistribution<Generator, bfloat16> {
108    public:
109     // The number of elements that will be returned.
110     static const int kResultElementCount = Generator::kResultElementCount;
111     // Cost of generation of a single element (in cycles).
112     static const int kElementCost = 3;
113     // Indicate that this distribution may take variable number of samples
114     // during the runtime.
115     static const bool kVariableSamplesPerOutput = false;
116     typedef Array<bfloat16, kResultElementCount> ResultType;
117     typedef bfloat16 ResultElementType;
118 
119     PHILOX_DEVICE_INLINE
operator()120     ResultType operator()(Generator* gen) {
121         typename Generator::ResultType sample = (*gen)();
122         ResultType result;
123         for (int i = 0; i < kResultElementCount; ++i) {
124             result[i] = Uint16ToGfloat16(sample[i]);
125         }
126         return result;
127     }
128 };
129 
130 template <class Generator>
131 class UniformDistribution<Generator, float> {
132    public:
133     // The number of elements that will be returned.
134     static const int kResultElementCount = Generator::kResultElementCount;
135     // Cost of generation of a single element (in cycles).
136     static const int kElementCost = 3;
137     // Indicate that this distribution may take variable number of samples
138     // during the runtime.
139     static const bool kVariableSamplesPerOutput = false;
140     typedef Array<float, kResultElementCount> ResultType;
141     typedef float ResultElementType;
142 
143     PHILOX_DEVICE_INLINE
operator()144     ResultType operator()(Generator* gen) {
145         typename Generator::ResultType sample = (*gen)();
146         ResultType result;
147         for (int i = 0; i < kResultElementCount; ++i) {
148             result[i] = Uint32ToFloat(sample[i]);
149         }
150         return result;
151     }
152 };
153 
154 template <class Generator>
155 class UniformDistribution<Generator, double> {
156    public:
157     // The number of elements that will be returned.
158     static const int kResultElementCount = Generator::kResultElementCount / 2;
159     // Cost of generation of a single element (in cycles).
160     static const int kElementCost = 3;
161     // Indicate that this distribution may take variable number of samples
162     // during the runtime.
163     static const bool kVariableSamplesPerOutput = false;
164     typedef Array<double, kResultElementCount> ResultType;
165     typedef double ResultElementType;
166 
167     PHILOX_DEVICE_INLINE
operator()168     ResultType operator()(Generator* gen) {
169         typename Generator::ResultType sample = (*gen)();
170         ResultType result;
171         for (int i = 0; i < kResultElementCount; ++i) {
172             result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
173         }
174         return result;
175     }
176 };
177 
178 template <class Generator>
179 class UniformDistribution<Generator, int32> {
180    public:
181     // The number of elements that will be returned.
182     static const int kResultElementCount = Generator::kResultElementCount;
183     // Cost of generation of a single element (in cycles).
184     static const int kElementCost = 3;
185     // Indicate that this distribution may take variable number of samples
186     // during the runtime.
187     static const bool kVariableSamplesPerOutput = false;
188     typedef Array<int32, kResultElementCount> ResultType;
189     typedef int32 ResultElementType;
190 
191     // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)192     UniformDistribution(int32 lo, int32 hi)
193         : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
194 
195     PHILOX_DEVICE_INLINE
operator()196     ResultType operator()(Generator* gen) {
197         typename Generator::ResultType sample = (*gen)();
198         ResultType result;
199         for (int i = 0; i < kResultElementCount; ++i) {
200             result[i] = SignedAdd(lo_, sample[i] % range_);
201         }
202         return result;
203     }
204 
205    private:
206     // Note that lo_ is intentionally signed while range_ is intentionally
207     // unsigned.  This is because hi - lo can overflow signed integers if
208     // lo < 0 < hi, but always fits in unsigned.
209     int32 lo_;
210     uint32 range_;
211 };
212 
213 template <class Generator>
214 class UniformDistribution<Generator, int64> {
215    public:
216     // The number of elements that will be returned.
217     static const int kResultElementCount = Generator::kResultElementCount / 2;
218     // Cost of generation of a single element (in cycles).
219     static const int kElementCost = 3;
220     // Indicate that this distribution may take variable number of samples
221     // during the runtime.
222     static const bool kVariableSamplesPerOutput = false;
223     typedef Array<int64, kResultElementCount> ResultType;
224     typedef int64 ResultElementType;
225 
226     // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)227     UniformDistribution(int64 lo, int64 hi)
228         : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
229 
230     PHILOX_DEVICE_INLINE
operator()231     ResultType operator()(Generator* gen) {
232         typename Generator::ResultType sample = (*gen)();
233         ResultType result;
234         for (int i = 0; i < kResultElementCount; ++i) {
235             auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
236             result[i] = SignedAdd(lo_, bits % range_);
237         }
238         return result;
239     }
240 
241    private:
242     // Note that lo_ is intentionally signed while range_ is intentionally
243     // unsigned.  This is because hi - lo can overflow signed integers if
244     // lo < 0 < hi, but always fits in unsigned.
245     int64 lo_;
246     uint64 range_;
247 };
248 
249 // A class that adapts the underlying native multiple samples to return a single
250 // sample at a time.
251 template <class Generator>
252 class SingleSampleAdapter {
253    public:
254     // The number of elements that will be returned.
255     static const int kResultElementCount = 1;
256     // The number of elements that will be returned by the underlying generator.
257     static const int kNativeElementCount = Generator::kResultElementCount;
258     typedef typename Generator::ResultElementType ResultType;
259     typedef typename Generator::ResultElementType ResultElementType;
260 
261     PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)262     explicit SingleSampleAdapter(Generator* gen)
263         : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
264 
265     PHILOX_DEVICE_INLINE
operator()266     ResultType operator()() {
267         if (used_result_index_ == Generator::kResultElementCount) {
268             unused_results_ = (*generator_)();
269             used_result_index_ = 0;
270         }
271 
272         return unused_results_[used_result_index_++];
273     }
274 
275     PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)276     void Skip(uint64 num_skips) {
277         if (!num_skips) {
278             return;
279         }
280         int num_unused_results = kNativeElementCount - used_result_index_;
281         if (num_skips <= num_unused_results) {
282             used_result_index_ += num_skips;
283             return;
284         }
285         num_skips -= num_unused_results;
286         used_result_index_ = kNativeElementCount;
287         SkipFromGenerator(num_skips / kNativeElementCount);
288         num_skips = num_skips % kNativeElementCount;
289         if (num_skips) {
290             unused_results_ = (*generator_)();
291             used_result_index_ = num_skips;
292         }
293     }
294 
295    private:
296     // This implementation iteratively skips over `num_skips` samples
297     // from `generator_`. There is an O(1) implementation for PhiloxRandom
298     // in random_distributions.cc.
299     PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)300     void SkipFromGenerator(uint64 num_skips) {
301         while (num_skips--) {
302             (*generator_)();
303         }
304     }
305 
306     Generator* generator_;
307     typename Generator::ResultType unused_results_;
308     int used_result_index_;
309 };
310 
311 // A class that generates unit normal distribution random numbers from the
312 // underlying random integer generator.
313 // Arguments:
314 //   Generator: a generator type that returns a number of uint32 upon each
315 //              each invocation. It needs to define kResultElementCount for the
316 //              sample count for each invocation, and ResultType for actual
317 //              returned sample type.
318 //   RealType: the data type of the real numbers that will be returned by the
319 //             distribution. This could be either float or double for now.
320 // This class is meant to be implemented through specialization. The default
321 // is not defined by design.
322 template <class Generator, typename RealType>
323 class NormalDistribution;
324 
325 PHILOX_DEVICE_INLINE
326 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
327 
328 PHILOX_DEVICE_INLINE
329 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
330 
331 // Exactly like the float version, except that we convert to half afterwards;
332 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
333 // gain from working in half internally.
334 template <class Generator>
335 class NormalDistribution<Generator, Eigen::half> {
336    public:
337     // The number of elements that will be returned.
338     static const int kResultElementCount = Generator::kResultElementCount;
339     // Cost of generation of a single element (in cycles).
340     static const int kElementCost = 70;
341     // Indicate that this distribution may take variable number of samples
342     // during the runtime.
343     static const bool kVariableSamplesPerOutput = false;
344     typedef Array<Eigen::half, kResultElementCount> ResultType;
345     typedef Eigen::half ResultElementType;
346 
347     PHILOX_DEVICE_INLINE
operator()348     ResultType operator()(Generator* gen) {
349         typename Generator::ResultType sample = (*gen)();
350         ResultType result;
351         for (int i = 0; i < kResultElementCount; i += 2) {
352             float f[2];
353             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
354             result[i] = Eigen::half(f[0]);
355             result[i + 1] = Eigen::half(f[1]);
356         }
357         return result;
358     }
359 };
360 
361 template <class Generator>
362 class NormalDistribution<Generator, bfloat16> {
363    public:
364     // The number of elements that will be returned.
365     static const int kResultElementCount = Generator::kResultElementCount;
366     // Cost of generation of a single element (in cycles).
367     static const int kElementCost = 70;
368     // Indicate that this distribution may take variable number of samples
369     // during the runtime.
370     static const bool kVariableSamplesPerOutput = false;
371     typedef Array<bfloat16, kResultElementCount> ResultType;
372     typedef bfloat16 ResultElementType;
373 
374     PHILOX_DEVICE_INLINE
operator()375     ResultType operator()(Generator* gen) {
376         typename Generator::ResultType sample = (*gen)();
377         ResultType result;
378         static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
379         for (int i = 0; i < kResultElementCount; i += 2) {
380             float f[2];
381             // Box-Muller transform requires processing 2 elements at a time.
382             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
383             result[i] = bfloat16(f[0]);
384             result[i + 1] = bfloat16(f[1]);
385         }
386         return result;
387     }
388 };
389 
390 template <class Generator>
391 class NormalDistribution<Generator, float> {
392    public:
393     // The number of elements that will be returned.
394     static const int kResultElementCount = Generator::kResultElementCount;
395     // Cost of generation of a single element (in cycles).
396     static const int kElementCost = 70;
397     // Indicate that this distribution may take variable number of samples
398     // during the runtime.
399     static const bool kVariableSamplesPerOutput = false;
400     typedef Array<float, kResultElementCount> ResultType;
401     typedef float ResultElementType;
402 
403     PHILOX_DEVICE_INLINE
operator()404     ResultType operator()(Generator* gen) {
405         typename Generator::ResultType sample = (*gen)();
406         ResultType result;
407         for (int i = 0; i < kResultElementCount; i += 2) {
408             BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
409         }
410         return result;
411     }
412 };
413 
414 template <class Generator>
415 class NormalDistribution<Generator, double> {
416    public:
417     // The number of elements that will be returned.
418     static const int kResultElementCount = Generator::kResultElementCount / 2;
419     // Cost of generation of a single element (in cycles).
420     static const int kElementCost = 70;
421     // Indicate that this distribution may take variable number of samples
422     // during the runtime.
423     static const bool kVariableSamplesPerOutput = false;
424     typedef Array<double, kResultElementCount> ResultType;
425     typedef double ResultElementType;
426 
427     PHILOX_DEVICE_INLINE
operator()428     ResultType operator()(Generator* gen) {
429         typename Generator::ResultType sample = (*gen)();
430         ResultType result;
431         for (int i = 0; i < kResultElementCount; i += 2) {
432             const int i2 = 2 * i;
433             BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
434                             &result[i + 1]);
435         }
436         return result;
437     }
438 };
439 
440 // A class that returns standard normal distribution between
441 // [-kTruncateValue, kTruncateValue].
442 // Arguments:
443 //   Generator: a generator type that returns a number of uint32 upon each
444 //              each invocation. It needs to define kResultElementCount for the
445 //              sample count for each invocation, and ResultType for actual
446 //              returned sample type.
447 //   RealType: the data type of the real numbers that will be returned by the
448 //             distribution. This could be either float or double for now.
449 // This class is meant to be implemented through specialization. The default
450 // is not defined by design.
451 template <class SingleSampleGenerator, typename RealType>
452 class TruncatedNormalDistribution;
453 
454 // Exactly like the float version, except that we convert to half afterwards;
455 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
456 // gain from working in half internally.
457 template <class SingleSampleGenerator>
458 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
459    public:
460     // The number of elements that will be returned.
461     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
462     // Cost of generation of a single element (in cycles).
463     static const int kElementCost = 90;
464     // Indicate that this distribution may take variable number of samples
465     // during the runtime.
466     static const bool kVariableSamplesPerOutput = true;
467     // The threshold where the normal distribution is truncated.
468     const float kTruncateValue = 2.0f;
469 
470     typedef Array<Eigen::half, kResultElementCount> ResultType;
471     typedef Eigen::half ResultElementType;
472 
473     PHILOX_DEVICE_INLINE
operator()474     ResultType operator()(SingleSampleGenerator* gen) {
475         ResultType results;
476         int index = 0;
477         while (true) {
478             // Repeatedly take samples from the normal distribution, until we have
479             // the desired number of elements that fall within the pre-defined cutoff
480             // threshold.
481             const uint32 x0 = (*gen)();
482             const uint32 x1 = (*gen)();
483             float f[2];
484             BoxMullerFloat(x0, x1, &f[0], &f[1]);
485 
486             for (int i = 0; i < 2; ++i) {
487                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
488                     results[index++] = Eigen::half(f[i]);
489                     if (index >= kResultElementCount) {
490                         return results;
491                     }
492                 }
493             }
494         }
495     }
496 };
497 
498 template <class SingleSampleGenerator>
499 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
500    public:
501     // The number of elements that will be returned.
502     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
503     // Cost of generation of a single element (in cycles).
504     static const int kElementCost = 90;
505     // Indicate that this distribution may take variable number of samples
506     // during the runtime.
507     static const bool kVariableSamplesPerOutput = true;
508     // The threshold where the normal distribution is truncated.
509     const float kTruncateValue = 2.0f;
510 
511     typedef Array<bfloat16, kResultElementCount> ResultType;
512     typedef bfloat16 ResultElementType;
513 
514     PHILOX_DEVICE_INLINE
operator()515     ResultType operator()(SingleSampleGenerator* gen) {
516         ResultType results;
517         int index = 0;
518         while (true) {
519             // Repeatedly take samples from the normal distribution, until we have
520             // the desired number of elements that fall within the pre-defined cutoff
521             // threshold.
522             const uint32 x0 = (*gen)();
523             const uint32 x1 = (*gen)();
524             float f[2];
525             BoxMullerFloat(x0, x1, &f[0], &f[1]);
526 
527             for (int i = 0; i < 2; ++i) {
528                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
529                     results[index++] = bfloat16(f[i]);
530                     if (index >= kResultElementCount) {
531                         return results;
532                     }
533                 }
534             }
535         }
536     }
537 };
538 
539 // Partial specialization for float.
540 template <class SingleSampleGenerator>
541 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
542    public:
543     // The number of elements that will be returned.
544     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
545     // Cost of generation of a single element (in cycles).
546     static const int kElementCost = 90;
547     // Indicate that this distribution may take variable number of samples
548     // during the runtime.
549     static const bool kVariableSamplesPerOutput = true;
550     // The threshold where the normal distribution is truncated.
551     const float kTruncateValue = 2.0f;
552 
553     typedef Array<float, kResultElementCount> ResultType;
554     typedef float ResultElementType;
555 
556     PHILOX_DEVICE_INLINE
operator()557     ResultType operator()(SingleSampleGenerator* gen) {
558         ResultType results;
559         int index = 0;
560         while (true) {
561             // Repeatedly take samples from the normal distribution, until we have
562             // the desired number of elements that fall within the pre-defined cutoff
563             // threshold.
564             const uint32 x0 = (*gen)();
565             const uint32 x1 = (*gen)();
566             float f[2];
567             BoxMullerFloat(x0, x1, &f[0], &f[1]);
568 
569             for (int i = 0; i < 2; ++i) {
570                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
571                     results[index++] = f[i];
572                     if (index >= kResultElementCount) {
573                         return results;
574                     }
575                 }
576             }
577         }
578     }
579 };
580 
581 // Partial specialization for double.
582 template <class SingleSampleGenerator>
583 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
584    public:
585     // The number of elements that will be returned.
586     static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
587                                                    ? SingleSampleGenerator::kNativeElementCount / 2
588                                                    : 1;
589     // Cost of generation of a single element (in cycles).
590     static const int kElementCost = 90;
591     // Indicate that this distribution may take variable number of samples
592     // during the runtime.
593     static const bool kVariableSamplesPerOutput = true;
594     typedef Array<double, kResultElementCount> ResultType;
595     typedef double ResultElementType;
596     const double kTruncateValue = 2.0;
597 
598     PHILOX_DEVICE_INLINE
operator()599     ResultType operator()(SingleSampleGenerator* gen) {
600         ResultType results;
601         int index = 0;
602         while (1) {
603             const uint32 x0 = (*gen)();
604             const uint32 x1 = (*gen)();
605             const uint32 x2 = (*gen)();
606             const uint32 x3 = (*gen)();
607             double d[2];
608             BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
609 
610             for (int i = 0; i < 2; ++i) {
611                 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
612                     results[index++] = d[i];
613                     if (index >= kResultElementCount) {
614                         return results;
615                     }
616                 }
617             }
618         }
619     }
620 };
621 
622 // Helper function to convert two 32-bit uniform integers to two floats
623 // under the unit normal distribution.
624 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)625 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
626     // This function implements the Box-Muller transform:
627     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
628     // Do not send a really small number to log().
629     // We cannot mark "epsilon" as "static const" because NVCC would complain
630     const float epsilon = 1.0e-7f;
631     float u1 = Uint32ToFloat(x0);
632     if (u1 < epsilon) {
633         u1 = epsilon;
634     }
635     const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
636     const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
637 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
638     *f0 = Eigen::numext::sin(v1);
639     *f1 = Eigen::numext::cos(v1);
640 #else
641     sincosf(v1, f0, f1);
642 #endif
643     *f0 *= u2;
644     *f1 *= u2;
645 }
646 
647 // Helper function to convert four 32-bit uniform integers to two doubles
648 // under the unit normal distribution.
649 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)650 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
651     // This function implements the Box-Muller transform:
652     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
653     // Do not send a really small number to log().
654     // We cannot mark "epsilon" as "static const" because NVCC would complain
655     const double epsilon = 1.0e-7;
656     double u1 = Uint64ToDouble(x0, x1);
657     if (u1 < epsilon) {
658         u1 = epsilon;
659     }
660     const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
661     const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
662 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
663     *d0 = Eigen::numext::sin(v1);
664     *d1 = Eigen::numext::cos(v1);
665 #else
666     sincos(v1, d0, d1);
667 #endif
668     *d0 *= u2;
669     *d1 *= u2;
670 }
671 
672 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)673 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
674     // IEEE754 halfs are formatted as follows (MSB first):
675     //    sign(1) exponent(5) mantissa(10)
676     // Conceptually construct the following:
677     //    sign == 0
678     //    exponent == 15  -- an excess 15 representation of a zero exponent
679     //    mantissa == 10 random bits
680     const uint16 man = x & 0x3ffu;  // 10 bit mantissa
681     const uint16 exp = static_cast<uint16>(15);
682     const uint16 val = (exp << 10) | man;
683 
684     Eigen::half result;
685     result.x = val;
686     return result - Eigen::half(1.0);
687 }
688 
689 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
690 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)691 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
692     // bfloat are formatted as follows (MSB first):
693     //    sign(1) exponent(8) mantissa(7)
694     // Conceptually construct the following:
695     //    sign == 0
696     //    exponent == 127  -- an excess 127 representation of a zero exponent
697     //    mantissa == 7 random bits
698     const uint16 man = x & 0x7fu;  // 7 bit mantissa
699     const uint16 exp = static_cast<uint16>(127);
700     const uint16 val = (exp << 7) | man;
701 
702     bfloat16 result;
703     memcpy(&result, &val, sizeof(val));
704     // The mantissa has an implicit leading 1, so the above code creates a value
705     // in [1, 2). The minus will not cause a rounding that makes the result 1.
706     // Instead it will just be close to 1.
707     return result - bfloat16(1.0);
708 }
709 
710 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)711 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
712     // IEEE754 floats are formatted as follows (MSB first):
713     //    sign(1) exponent(8) mantissa(23)
714     // Conceptually construct the following:
715     //    sign == 0
716     //    exponent == 127  -- an excess 127 representation of a zero exponent
717     //    mantissa == 23 random bits
718     const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
719     const uint32 exp = static_cast<uint32>(127);
720     const uint32 val = (exp << 23) | man;
721 
722     // Assumes that endian-ness is same for float and uint32.
723     float result;
724     memcpy(&result, &val, sizeof(val));
725     return result - 1.0f;
726 }
727 
728 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)729 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
730     // IEEE754 doubles are formatted as follows (MSB first):
731     //    sign(1) exponent(11) mantissa(52)
732     // Conceptually construct the following:
733     //    sign == 0
734     //    exponent == 1023  -- an excess 1023 representation of a zero exponent
735     //    mantissa == 52 random bits
736     const uint32 mhi = x0 & 0xfffffu;                           // upper 20 bits of mantissa
737     const uint32 mlo = x1;                                      // lower 32 bits of mantissa
738     const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
739     const uint64 exp = static_cast<uint64>(1023);
740     const uint64 val = (exp << 52) | man;
741     // Assumes that endian-ness is same for double and uint64.
742     double result;
743     memcpy(&result, &val, sizeof(val));
744     return result - 1.0;
745 }
746 
747 }  // namespace random
748 }  // namespace tensorflow
749 
750 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
751