1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 #define _USE_MATH_DEFINES
20 #include <math.h>
21 #include <cmath>
22 #undef _USE_MATH_DEFINES
23 
24 #include <string.h>
25 #include <algorithm>
26 #include <type_traits>
27 
28 #include "philox_random.h"
29 #include "tensorflow/core/lib/bfloat16/bfloat16.h"
30 #include "unsupported/Eigen/CXX11/Tensor"
31 
32 namespace tensorflow {
33 namespace random {
34 
35 // Helper function to convert a 16-bit integer to a half between [0..1).
36 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
37 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
38 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
39 // Helper function to convert a 32-bit integer to a float between [0..1).
40 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
41 // Helper function to convert two 32-bit integers to a double between [0..1).
42 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
43 
44 // Computes a + b. Requires that the result is representable in the destination
45 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
46 // need *not* be representable in that type. (The condition on b excludes the
47 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
48 // compute.)
49 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)50 PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) {
51     // Implementation note: both b_div_2 and b - b_div_2 are positive and
52     // representatble as Int.
53     auto b_div_2 = b >> 1;
54     return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
55 }
56 
57 // A class that generates uniform distribution random numbers from the
58 // underlying random integer generator.
59 // Arguments:
60 //   Generator: a generator type that returns a number of uint32 upon each
61 //              invocation. It needs to define kResultElementCount for the
62 //              sample count for each invocation, and ResultType for the
63 //              actual returned sample type.
64 //   RealType: the data type of the real numbers that will be returned by the
65 //             distribution. This could be either float or double for now.
66 // This class is meant to be implemented through specialization. The default
67 // is not defined by design.
68 template <class Generator, typename RealType>
69 class UniformDistribution;
70 
71 template <class Generator>
72 class UniformDistribution<Generator, Eigen::half> {
73    public:
74     // The number of elements that will be returned.
75     static const int kResultElementCount = Generator::kResultElementCount;
76     // Cost of generation of a single element (in cycles).
77     static const int kElementCost = 3;
78     // Indicate that this distribution may take variable number of samples
79     // during the runtime.
80     static const bool kVariableSamplesPerOutput = false;
81     typedef Array<Eigen::half, kResultElementCount> ResultType;
82     typedef Eigen::half ResultElementType;
83 
84     PHILOX_DEVICE_INLINE
operator()85     ResultType operator()(Generator* gen) {
86         typename Generator::ResultType sample = (*gen)();
87         ResultType result;
88         for (int i = 0; i < kResultElementCount; ++i) {
89             result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
90         }
91         return result;
92     }
93 };
94 
95 template <class Generator>
96 class UniformDistribution<Generator, bfloat16> {
97    public:
98     // The number of elements that will be returned.
99     static const int kResultElementCount = Generator::kResultElementCount;
100     // Cost of generation of a single element (in cycles).
101     static const int kElementCost = 3;
102     // Indicate that this distribution may take variable number of samples
103     // during the runtime.
104     static const bool kVariableSamplesPerOutput = false;
105     typedef Array<bfloat16, kResultElementCount> ResultType;
106     typedef bfloat16 ResultElementType;
107 
108     PHILOX_DEVICE_INLINE
operator()109     ResultType operator()(Generator* gen) {
110         typename Generator::ResultType sample = (*gen)();
111         ResultType result;
112         for (int i = 0; i < kResultElementCount; ++i) {
113             result[i] = Uint16ToGfloat16(sample[i]);
114         }
115         return result;
116     }
117 };
118 
119 template <class Generator>
120 class UniformDistribution<Generator, float> {
121    public:
122     // The number of elements that will be returned.
123     static const int kResultElementCount = Generator::kResultElementCount;
124     // Cost of generation of a single element (in cycles).
125     static const int kElementCost = 3;
126     // Indicate that this distribution may take variable number of samples
127     // during the runtime.
128     static const bool kVariableSamplesPerOutput = false;
129     typedef Array<float, kResultElementCount> ResultType;
130     typedef float ResultElementType;
131 
132     PHILOX_DEVICE_INLINE
operator()133     ResultType operator()(Generator* gen) {
134         typename Generator::ResultType sample = (*gen)();
135         ResultType result;
136         for (int i = 0; i < kResultElementCount; ++i) {
137             result[i] = Uint32ToFloat(sample[i]);
138         }
139         return result;
140     }
141 };
142 
143 template <class Generator>
144 class UniformDistribution<Generator, double> {
145    public:
146     // The number of elements that will be returned.
147     static const int kResultElementCount = Generator::kResultElementCount / 2;
148     // Cost of generation of a single element (in cycles).
149     static const int kElementCost = 3;
150     // Indicate that this distribution may take variable number of samples
151     // during the runtime.
152     static const bool kVariableSamplesPerOutput = false;
153     typedef Array<double, kResultElementCount> ResultType;
154     typedef double ResultElementType;
155 
156     PHILOX_DEVICE_INLINE
operator()157     ResultType operator()(Generator* gen) {
158         typename Generator::ResultType sample = (*gen)();
159         ResultType result;
160         for (int i = 0; i < kResultElementCount; ++i) {
161             result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
162         }
163         return result;
164     }
165 };
166 
167 template <class Generator>
168 class UniformDistribution<Generator, int32> {
169    public:
170     // The number of elements that will be returned.
171     static const int kResultElementCount = Generator::kResultElementCount;
172     // Cost of generation of a single element (in cycles).
173     static const int kElementCost = 3;
174     // Indicate that this distribution may take variable number of samples
175     // during the runtime.
176     static const bool kVariableSamplesPerOutput = false;
177     typedef Array<int32, kResultElementCount> ResultType;
178     typedef int32 ResultElementType;
179 
180     // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)181     UniformDistribution(int32 lo, int32 hi)
182         : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
183 
184     PHILOX_DEVICE_INLINE
operator()185     ResultType operator()(Generator* gen) {
186         typename Generator::ResultType sample = (*gen)();
187         ResultType result;
188         for (int i = 0; i < kResultElementCount; ++i) {
189             result[i] = SignedAdd(lo_, sample[i] % range_);
190         }
191         return result;
192     }
193 
194    private:
195     // Note that lo_ is intentionally signed while range_ is intentionally
196     // unsigned.  This is because hi - lo can overflow signed integers if
197     // lo < 0 < hi, but always fits in unsigned.
198     int32 lo_;
199     uint32 range_;
200 };
201 
202 template <class Generator>
203 class UniformDistribution<Generator, int64> {
204    public:
205     // The number of elements that will be returned.
206     static const int kResultElementCount = Generator::kResultElementCount / 2;
207     // Cost of generation of a single element (in cycles).
208     static const int kElementCost = 3;
209     // Indicate that this distribution may take variable number of samples
210     // during the runtime.
211     static const bool kVariableSamplesPerOutput = false;
212     typedef Array<int64, kResultElementCount> ResultType;
213     typedef int64 ResultElementType;
214 
215     // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)216     UniformDistribution(int64 lo, int64 hi)
217         : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
218 
219     PHILOX_DEVICE_INLINE
operator()220     ResultType operator()(Generator* gen) {
221         typename Generator::ResultType sample = (*gen)();
222         ResultType result;
223         for (int i = 0; i < kResultElementCount; ++i) {
224             auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
225             result[i] = SignedAdd(lo_, bits % range_);
226         }
227         return result;
228     }
229 
230    private:
231     // Note that lo_ is intentionally signed while range_ is intentionally
232     // unsigned.  This is because hi - lo can overflow signed integers if
233     // lo < 0 < hi, but always fits in unsigned.
234     int64 lo_;
235     uint64 range_;
236 };
237 
238 // A class that adapts the underlying native multiple samples to return a single
239 // sample at a time.
240 template <class Generator>
241 class SingleSampleAdapter {
242    public:
243     // The number of elements that will be returned.
244     static const int kResultElementCount = 1;
245     // The number of elements that will be returned by the underlying generator.
246     static const int kNativeElementCount = Generator::kResultElementCount;
247     typedef typename Generator::ResultElementType ResultType;
248     typedef typename Generator::ResultElementType ResultElementType;
249 
250     PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)251     explicit SingleSampleAdapter(Generator* gen)
252         : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
253 
254     PHILOX_DEVICE_INLINE
operator()255     ResultType operator()() {
256         if (used_result_index_ == Generator::kResultElementCount) {
257             unused_results_ = (*generator_)();
258             used_result_index_ = 0;
259         }
260 
261         return unused_results_[used_result_index_++];
262     }
263 
264     PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)265     void Skip(uint64 num_skips) {
266         if (!num_skips) {
267             return;
268         }
269         int num_unused_results = kNativeElementCount - used_result_index_;
270         if (num_skips <= num_unused_results) {
271             used_result_index_ += num_skips;
272             return;
273         }
274         num_skips -= num_unused_results;
275         used_result_index_ = kNativeElementCount;
276         SkipFromGenerator(num_skips / kNativeElementCount);
277         num_skips = num_skips % kNativeElementCount;
278         if (num_skips) {
279             unused_results_ = (*generator_)();
280             used_result_index_ = num_skips;
281         }
282     }
283 
284    private:
285     // This implementation iteratively skips over `num_skips` samples
286     // from `generator_`. There is an O(1) implementation for PhiloxRandom
287     // in random_distributions.cc.
288     PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)289     void SkipFromGenerator(uint64 num_skips) {
290         while (num_skips--) {
291             (*generator_)();
292         }
293     }
294 
295     Generator* generator_;
296     typename Generator::ResultType unused_results_;
297     int used_result_index_;
298 };
299 
300 // A class that generates unit normal distribution random numbers from the
301 // underlying random integer generator.
302 // Arguments:
303 //   Generator: a generator type that returns a number of uint32 upon each
304 //              each invocation. It needs to define kResultElementCount for the
305 //              sample count for each invocation, and ResultType for actual
306 //              returned sample type.
307 //   RealType: the data type of the real numbers that will be returned by the
308 //             distribution. This could be either float or double for now.
309 // This class is meant to be implemented through specialization. The default
310 // is not defined by design.
311 template <class Generator, typename RealType>
312 class NormalDistribution;
313 
314 PHILOX_DEVICE_INLINE
315 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
316 
317 PHILOX_DEVICE_INLINE
318 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1);
319 
320 // Exactly like the float version, except that we convert to half afterwards;
321 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
322 // gain from working in half internally.
323 template <class Generator>
324 class NormalDistribution<Generator, Eigen::half> {
325    public:
326     // The number of elements that will be returned.
327     static const int kResultElementCount = Generator::kResultElementCount;
328     // Cost of generation of a single element (in cycles).
329     static const int kElementCost = 70;
330     // Indicate that this distribution may take variable number of samples
331     // during the runtime.
332     static const bool kVariableSamplesPerOutput = false;
333     typedef Array<Eigen::half, kResultElementCount> ResultType;
334     typedef Eigen::half ResultElementType;
335 
336     PHILOX_DEVICE_INLINE
operator()337     ResultType operator()(Generator* gen) {
338         typename Generator::ResultType sample = (*gen)();
339         ResultType result;
340         for (int i = 0; i < kResultElementCount; i += 2) {
341             float f[2];
342             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
343             result[i] = Eigen::half(f[0]);
344             result[i + 1] = Eigen::half(f[1]);
345         }
346         return result;
347     }
348 };
349 
350 template <class Generator>
351 class NormalDistribution<Generator, bfloat16> {
352    public:
353     // The number of elements that will be returned.
354     static const int kResultElementCount = Generator::kResultElementCount;
355     // Cost of generation of a single element (in cycles).
356     static const int kElementCost = 70;
357     // Indicate that this distribution may take variable number of samples
358     // during the runtime.
359     static const bool kVariableSamplesPerOutput = false;
360     typedef Array<bfloat16, kResultElementCount> ResultType;
361     typedef bfloat16 ResultElementType;
362 
363     PHILOX_DEVICE_INLINE
operator()364     ResultType operator()(Generator* gen) {
365         typename Generator::ResultType sample = (*gen)();
366         ResultType result;
367         static_assert(kResultElementCount % 2 == 0, "kResultElementCount should be an even number");
368         for (int i = 0; i < kResultElementCount; i += 2) {
369             float f[2];
370             // Box-Muller transform requires processing 2 elements at a time.
371             BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
372             result[i] = bfloat16(f[0]);
373             result[i + 1] = bfloat16(f[1]);
374         }
375         return result;
376     }
377 };
378 
379 template <class Generator>
380 class NormalDistribution<Generator, float> {
381    public:
382     // The number of elements that will be returned.
383     static const int kResultElementCount = Generator::kResultElementCount;
384     // Cost of generation of a single element (in cycles).
385     static const int kElementCost = 70;
386     // Indicate that this distribution may take variable number of samples
387     // during the runtime.
388     static const bool kVariableSamplesPerOutput = false;
389     typedef Array<float, kResultElementCount> ResultType;
390     typedef float ResultElementType;
391 
392     PHILOX_DEVICE_INLINE
operator()393     ResultType operator()(Generator* gen) {
394         typename Generator::ResultType sample = (*gen)();
395         ResultType result;
396         for (int i = 0; i < kResultElementCount; i += 2) {
397             BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
398         }
399         return result;
400     }
401 };
402 
403 template <class Generator>
404 class NormalDistribution<Generator, double> {
405    public:
406     // The number of elements that will be returned.
407     static const int kResultElementCount = Generator::kResultElementCount / 2;
408     // Cost of generation of a single element (in cycles).
409     static const int kElementCost = 70;
410     // Indicate that this distribution may take variable number of samples
411     // during the runtime.
412     static const bool kVariableSamplesPerOutput = false;
413     typedef Array<double, kResultElementCount> ResultType;
414     typedef double ResultElementType;
415 
416     PHILOX_DEVICE_INLINE
operator()417     ResultType operator()(Generator* gen) {
418         typename Generator::ResultType sample = (*gen)();
419         ResultType result;
420         for (int i = 0; i < kResultElementCount; i += 2) {
421             const int i2 = 2 * i;
422             BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
423                             &result[i + 1]);
424         }
425         return result;
426     }
427 };
428 
429 // A class that returns standard normal distribution between
430 // [-kTruncateValue, kTruncateValue].
431 // Arguments:
432 //   Generator: a generator type that returns a number of uint32 upon each
433 //              each invocation. It needs to define kResultElementCount for the
434 //              sample count for each invocation, and ResultType for actual
435 //              returned sample type.
436 //   RealType: the data type of the real numbers that will be returned by the
437 //             distribution. This could be either float or double for now.
438 // This class is meant to be implemented through specialization. The default
439 // is not defined by design.
440 template <class SingleSampleGenerator, typename RealType>
441 class TruncatedNormalDistribution;
442 
443 // Exactly like the float version, except that we convert to half afterwards;
444 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
445 // gain from working in half internally.
446 template <class SingleSampleGenerator>
447 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
448    public:
449     // The number of elements that will be returned.
450     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
451     // Cost of generation of a single element (in cycles).
452     static const int kElementCost = 90;
453     // Indicate that this distribution may take variable number of samples
454     // during the runtime.
455     static const bool kVariableSamplesPerOutput = true;
456     // The threshold where the normal distribution is truncated.
457     const float kTruncateValue = 2.0f;
458 
459     typedef Array<Eigen::half, kResultElementCount> ResultType;
460     typedef Eigen::half ResultElementType;
461 
462     PHILOX_DEVICE_INLINE
operator()463     ResultType operator()(SingleSampleGenerator* gen) {
464         ResultType results;
465         int index = 0;
466         while (true) {
467             // Repeatedly take samples from the normal distribution, until we have
468             // the desired number of elements that fall within the pre-defined cutoff
469             // threshold.
470             const uint32 x0 = (*gen)();
471             const uint32 x1 = (*gen)();
472             float f[2];
473             BoxMullerFloat(x0, x1, &f[0], &f[1]);
474 
475             for (int i = 0; i < 2; ++i) {
476                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
477                     results[index++] = Eigen::half(f[i]);
478                     if (index >= kResultElementCount) {
479                         return results;
480                     }
481                 }
482             }
483         }
484     }
485 };
486 
487 template <class SingleSampleGenerator>
488 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
489    public:
490     // The number of elements that will be returned.
491     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
492     // Cost of generation of a single element (in cycles).
493     static const int kElementCost = 90;
494     // Indicate that this distribution may take variable number of samples
495     // during the runtime.
496     static const bool kVariableSamplesPerOutput = true;
497     // The threshold where the normal distribution is truncated.
498     const float kTruncateValue = 2.0f;
499 
500     typedef Array<bfloat16, kResultElementCount> ResultType;
501     typedef bfloat16 ResultElementType;
502 
503     PHILOX_DEVICE_INLINE
operator()504     ResultType operator()(SingleSampleGenerator* gen) {
505         ResultType results;
506         int index = 0;
507         while (true) {
508             // Repeatedly take samples from the normal distribution, until we have
509             // the desired number of elements that fall within the pre-defined cutoff
510             // threshold.
511             const uint32 x0 = (*gen)();
512             const uint32 x1 = (*gen)();
513             float f[2];
514             BoxMullerFloat(x0, x1, &f[0], &f[1]);
515 
516             for (int i = 0; i < 2; ++i) {
517                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
518                     results[index++] = bfloat16(f[i]);
519                     if (index >= kResultElementCount) {
520                         return results;
521                     }
522                 }
523             }
524         }
525     }
526 };
527 
528 // Partial specialization for float.
529 template <class SingleSampleGenerator>
530 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
531    public:
532     // The number of elements that will be returned.
533     static const int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
534     // Cost of generation of a single element (in cycles).
535     static const int kElementCost = 90;
536     // Indicate that this distribution may take variable number of samples
537     // during the runtime.
538     static const bool kVariableSamplesPerOutput = true;
539     // The threshold where the normal distribution is truncated.
540     const float kTruncateValue = 2.0f;
541 
542     typedef Array<float, kResultElementCount> ResultType;
543     typedef float ResultElementType;
544 
545     PHILOX_DEVICE_INLINE
operator()546     ResultType operator()(SingleSampleGenerator* gen) {
547         ResultType results;
548         int index = 0;
549         while (true) {
550             // Repeatedly take samples from the normal distribution, until we have
551             // the desired number of elements that fall within the pre-defined cutoff
552             // threshold.
553             const uint32 x0 = (*gen)();
554             const uint32 x1 = (*gen)();
555             float f[2];
556             BoxMullerFloat(x0, x1, &f[0], &f[1]);
557 
558             for (int i = 0; i < 2; ++i) {
559                 if (Eigen::numext::abs(f[i]) < kTruncateValue) {
560                     results[index++] = f[i];
561                     if (index >= kResultElementCount) {
562                         return results;
563                     }
564                 }
565             }
566         }
567     }
568 };
569 
570 // Partial specialization for double.
571 template <class SingleSampleGenerator>
572 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
573    public:
574     // The number of elements that will be returned.
575     static const int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
576                                                    ? SingleSampleGenerator::kNativeElementCount / 2
577                                                    : 1;
578     // Cost of generation of a single element (in cycles).
579     static const int kElementCost = 90;
580     // Indicate that this distribution may take variable number of samples
581     // during the runtime.
582     static const bool kVariableSamplesPerOutput = true;
583     typedef Array<double, kResultElementCount> ResultType;
584     typedef double ResultElementType;
585     const double kTruncateValue = 2.0;
586 
587     PHILOX_DEVICE_INLINE
operator()588     ResultType operator()(SingleSampleGenerator* gen) {
589         ResultType results;
590         int index = 0;
591         while (1) {
592             const uint32 x0 = (*gen)();
593             const uint32 x1 = (*gen)();
594             const uint32 x2 = (*gen)();
595             const uint32 x3 = (*gen)();
596             double d[2];
597             BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
598 
599             for (int i = 0; i < 2; ++i) {
600                 if (Eigen::numext::abs(d[i]) < kTruncateValue) {
601                     results[index++] = d[i];
602                     if (index >= kResultElementCount) {
603                         return results;
604                     }
605                 }
606             }
607         }
608     }
609 };
610 
611 // Helper function to convert two 32-bit uniform integers to two floats
612 // under the unit normal distribution.
613 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)614 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
615     // This function implements the Box-Muller transform:
616     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
617     // Do not send a really small number to log().
618     // We cannot mark "epsilon" as "static const" because NVCC would complain
619     const float epsilon = 1.0e-7f;
620     float u1 = Uint32ToFloat(x0);
621     if (u1 < epsilon) {
622         u1 = epsilon;
623     }
624     const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
625     const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
626 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
627     *f0 = Eigen::numext::sin(v1);
628     *f1 = Eigen::numext::cos(v1);
629 #else
630     sincosf(v1, f0, f1);
631 #endif
632     *f0 *= u2;
633     *f1 *= u2;
634 }
635 
636 // Helper function to convert four 32-bit uniform integers to two doubles
637 // under the unit normal distribution.
638 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)639 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0, double* d1) {
640     // This function implements the Box-Muller transform:
641     // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
642     // Do not send a really small number to log().
643     // We cannot mark "epsilon" as "static const" because NVCC would complain
644     const double epsilon = 1.0e-7;
645     double u1 = Uint64ToDouble(x0, x1);
646     if (u1 < epsilon) {
647         u1 = epsilon;
648     }
649     const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
650     const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
651 #if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
652     *d0 = Eigen::numext::sin(v1);
653     *d1 = Eigen::numext::cos(v1);
654 #else
655     sincos(v1, d0, d1);
656 #endif
657     *d0 *= u2;
658     *d1 *= u2;
659 }
660 
661 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)662 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
663     // IEEE754 halfs are formatted as follows (MSB first):
664     //    sign(1) exponent(5) mantissa(10)
665     // Conceptually construct the following:
666     //    sign == 0
667     //    exponent == 15  -- an excess 15 representation of a zero exponent
668     //    mantissa == 10 random bits
669     const uint16 man = x & 0x3ffu;  // 10 bit mantissa
670     const uint16 exp = static_cast<uint16>(15);
671     const uint16 val = (exp << 10) | man;
672 
673     Eigen::half result;
674     result.x = val;
675     return result - Eigen::half(1.0);
676 }
677 
678 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
679 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)680 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
681     // bfloat are formatted as follows (MSB first):
682     //    sign(1) exponent(8) mantissa(7)
683     // Conceptually construct the following:
684     //    sign == 0
685     //    exponent == 127  -- an excess 127 representation of a zero exponent
686     //    mantissa == 7 random bits
687     const uint16 man = x & 0x7fu;  // 7 bit mantissa
688     const uint16 exp = static_cast<uint16>(127);
689     const uint16 val = (exp << 7) | man;
690 
691     bfloat16 result;
692     memcpy(&result, &val, sizeof(val));
693     // The mantissa has an implicit leading 1, so the above code creates a value
694     // in [1, 2). The minus will not cause a rounding that makes the result 1.
695     // Instead it will just be close to 1.
696     return result - bfloat16(1.0);
697 }
698 
699 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)700 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
701     // IEEE754 floats are formatted as follows (MSB first):
702     //    sign(1) exponent(8) mantissa(23)
703     // Conceptually construct the following:
704     //    sign == 0
705     //    exponent == 127  -- an excess 127 representation of a zero exponent
706     //    mantissa == 23 random bits
707     const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
708     const uint32 exp = static_cast<uint32>(127);
709     const uint32 val = (exp << 23) | man;
710 
711     // Assumes that endian-ness is same for float and uint32.
712     float result;
713     memcpy(&result, &val, sizeof(val));
714     return result - 1.0f;
715 }
716 
717 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)718 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
719     // IEEE754 doubles are formatted as follows (MSB first):
720     //    sign(1) exponent(11) mantissa(52)
721     // Conceptually construct the following:
722     //    sign == 0
723     //    exponent == 1023  -- an excess 1023 representation of a zero exponent
724     //    mantissa == 52 random bits
725     const uint32 mhi = x0 & 0xfffffu;                           // upper 20 bits of mantissa
726     const uint32 mlo = x1;                                      // lower 32 bits of mantissa
727     const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
728     const uint64 exp = static_cast<uint64>(1023);
729     const uint64 val = (exp << 52) | man;
730     // Assumes that endian-ness is same for double and uint64.
731     double result;
732     memcpy(&result, &val, sizeof(val));
733     return result - 1.0;
734 }
735 
736 }  // namespace random
737 }  // namespace tensorflow
738 
739 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
740