1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
17 #define TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
18 
19 #include <string.h>
20 
21 #include <algorithm>
22 #include <cmath>
23 #include <type_traits>
24 
25 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
26 #include "tensorflow/core/lib/random/philox_random.h"
27 #include "tensorflow/core/platform/types.h"
28 
29 namespace tensorflow {
30 namespace random {
31 
32 // Helper function to convert a 16-bit integer to a half between [0..1).
33 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
34 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
35 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
36 // Helper function to convert a 32-bit integer to a float between [0..1).
37 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x);
38 // Helper function to convert two 32-bit integers to a double between [0..1).
39 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1);
40 
41 // Computes a + b. Requires that the result is representable in the destination
42 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
43 // need *not* be representable in that type. (The condition on b excludes the
44 // extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
45 // compute.)
46 template <typename Int>
SignedAdd(Int a,typename std::make_unsigned<Int>::type b)47 PHILOX_DEVICE_INLINE Int SignedAdd(Int a,
48                                    typename std::make_unsigned<Int>::type b) {
49   // Implementation note: both b_div_2 and b - b_div_2 are positive and
50   // representable as Int.
51   auto b_div_2 = b >> 1;
52   return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
53 }
54 
55 // A class that generates uniform distribution random numbers from the
56 // underlying random integer generator.
57 // Arguments:
58 //   Generator: a generator type that returns a number of uint32 upon each
59 //              invocation. It needs to define kResultElementCount for the
60 //              sample count for each invocation, and ResultType for the
61 //              actual returned sample type.
62 //   RealType: the data type of the real numbers that will be returned by the
63 //             distribution. This could be either float or double for now.
64 // This class is meant to be implemented through specialization. The default
65 // is not defined by design.
66 template <class Generator, typename RealType>
67 class UniformDistribution;
68 
69 template <class Generator>
70 class UniformDistribution<Generator, Eigen::half> {
71  public:
72   // The number of elements that will be returned.
73   static constexpr int kResultElementCount = Generator::kResultElementCount;
74   // Cost of generation of a single element (in cycles).
75   static constexpr int kElementCost = 3;
76   // Indicate that this distribution may take variable number of samples
77   // during the runtime.
78   static constexpr bool kVariableSamplesPerOutput = false;
79   typedef Array<Eigen::half, kResultElementCount> ResultType;
80   typedef Eigen::half ResultElementType;
81 
82   PHILOX_DEVICE_INLINE
operator()83   ResultType operator()(Generator* gen) {
84     typename Generator::ResultType sample = (*gen)();
85     ResultType result;
86     for (int i = 0; i < kResultElementCount; ++i) {
87       result[i] = Uint16ToHalf(sample[i]);  // Truncate the upper 16 bits.
88     }
89     return result;
90   }
91 };
92 
93 template <class Generator>
94 class UniformDistribution<Generator, bfloat16> {
95  public:
96   // The number of elements that will be returned.
97   static constexpr int kResultElementCount = Generator::kResultElementCount;
98   // Cost of generation of a single element (in cycles).
99   static constexpr int kElementCost = 3;
100   // Indicate that this distribution may take variable number of samples
101   // during the runtime.
102   static constexpr bool kVariableSamplesPerOutput = false;
103   typedef Array<bfloat16, kResultElementCount> ResultType;
104   typedef bfloat16 ResultElementType;
105 
106   PHILOX_DEVICE_INLINE
operator()107   ResultType operator()(Generator* gen) {
108     typename Generator::ResultType sample = (*gen)();
109     ResultType result;
110     for (int i = 0; i < kResultElementCount; ++i) {
111       result[i] = Uint16ToGfloat16(sample[i]);
112     }
113     return result;
114   }
115 };
116 
117 template <class Generator>
118 class UniformDistribution<Generator, float> {
119  public:
120   // The number of elements that will be returned.
121   static constexpr int kResultElementCount = Generator::kResultElementCount;
122   // Cost of generation of a single element (in cycles).
123   static constexpr int kElementCost = 3;
124   // Indicate that this distribution may take variable number of samples
125   // during the runtime.
126   static constexpr bool kVariableSamplesPerOutput = false;
127   typedef Array<float, kResultElementCount> ResultType;
128   typedef float ResultElementType;
129 
130   PHILOX_DEVICE_INLINE
operator()131   ResultType operator()(Generator* gen) {
132     typename Generator::ResultType sample = (*gen)();
133     ResultType result;
134     for (int i = 0; i < kResultElementCount; ++i) {
135       result[i] = Uint32ToFloat(sample[i]);
136     }
137     return result;
138   }
139 };
140 
141 template <class Generator>
142 class UniformDistribution<Generator, double> {
143  public:
144   // The number of elements that will be returned.
145   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
146   // Cost of generation of a single element (in cycles).
147   static constexpr int kElementCost = 3;
148   // Indicate that this distribution may take variable number of samples
149   // during the runtime.
150   static constexpr bool kVariableSamplesPerOutput = false;
151   typedef Array<double, kResultElementCount> ResultType;
152   typedef double ResultElementType;
153 
154   PHILOX_DEVICE_INLINE
operator()155   ResultType operator()(Generator* gen) {
156     typename Generator::ResultType sample = (*gen)();
157     ResultType result;
158     for (int i = 0; i < kResultElementCount; ++i) {
159       result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
160     }
161     return result;
162   }
163 };
164 
165 template <class Generator>
166 class UniformDistribution<Generator, int32> {
167  public:
168   // The number of elements that will be returned.
169   static constexpr int kResultElementCount = Generator::kResultElementCount;
170   // Cost of generation of a single element (in cycles).
171   static constexpr int kElementCost = 3;
172   // Indicate that this distribution may take variable number of samples
173   // during the runtime.
174   static constexpr bool kVariableSamplesPerOutput = false;
175   typedef Array<int32, kResultElementCount> ResultType;
176   typedef int32 ResultElementType;
177 
178   // Must have lo < hi
UniformDistribution(int32 lo,int32 hi)179   UniformDistribution(int32 lo, int32 hi)
180       : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
181 
182   PHILOX_DEVICE_INLINE
operator()183   ResultType operator()(Generator* gen) {
184     typename Generator::ResultType sample = (*gen)();
185     ResultType result;
186     for (int i = 0; i < kResultElementCount; ++i) {
187       result[i] = SignedAdd(lo_, sample[i] % range_);
188     }
189     return result;
190   }
191 
192  private:
193   // Note that lo_ is intentionally signed while range_ is intentionally
194   // unsigned.  This is because hi - lo can overflow signed integers if
195   // lo < 0 < hi, but always fits in unsigned.
196   int32 lo_;
197   uint32 range_;
198 };
199 
200 template <class Generator>
201 class UniformDistribution<Generator, int64> {
202  public:
203   // The number of elements that will be returned.
204   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
205   // Cost of generation of a single element (in cycles).
206   static constexpr int kElementCost = 3;
207   // Indicate that this distribution may take variable number of samples
208   // during the runtime.
209   static constexpr bool kVariableSamplesPerOutput = false;
210   typedef Array<int64, kResultElementCount> ResultType;
211   typedef int64 ResultElementType;
212 
213   // Must have lo < hi
UniformDistribution(int64 lo,int64 hi)214   UniformDistribution(int64 lo, int64 hi)
215       : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
216 
217   PHILOX_DEVICE_INLINE
operator()218   ResultType operator()(Generator* gen) {
219     typename Generator::ResultType sample = (*gen)();
220     ResultType result;
221     for (int i = 0; i < kResultElementCount; ++i) {
222       auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
223       result[i] = SignedAdd(lo_, bits % range_);
224     }
225     return result;
226   }
227 
228  private:
229   // Note that lo_ is intentionally signed while range_ is intentionally
230   // unsigned.  This is because hi - lo can overflow signed integers if
231   // lo < 0 < hi, but always fits in unsigned.
232   int64 lo_;
233   uint64 range_;
234 };
235 
236 // Similar to `UniformDistribution`, except that instead of generating numbers
237 // in the range [low, high), it generates numbers covering the whole range of
238 // the integer type.
239 template <typename Generator, typename IntType>
240 class UniformFullIntDistribution;
241 
242 template <typename Generator, typename IntType>
243 class UniformFullIntDistribution32 {
244  public:
245   // The number of elements that will be returned.
246   static constexpr int kResultElementCount = Generator::kResultElementCount;
247   // Cost of generation of a single element (in cycles).
248   static constexpr int kElementCost = 3;
249   // Indicate that this distribution may take variable number of samples
250   // during the runtime.
251   static constexpr bool kVariableSamplesPerOutput = false;
252   typedef Array<IntType, kResultElementCount> ResultType;
253   typedef IntType ResultElementType;
254 
255   PHILOX_DEVICE_INLINE
operator()256   ResultType operator()(Generator* gen) {
257     typename Generator::ResultType sample = (*gen)();
258     ResultType result;
259     for (int i = 0; i < kResultElementCount; ++i) {
260       result[i] = sample[i];
261     }
262     return result;
263   }
264 };
265 
266 template <typename Generator, typename IntType>
267 class UniformFullIntDistribution64 {
268  public:
269   // The number of elements that will be returned.
270   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
271   // Cost of generation of a single element (in cycles).
272   static constexpr int kElementCost = 3;
273   // Indicate that this distribution may take variable number of samples
274   // during the runtime.
275   static constexpr bool kVariableSamplesPerOutput = false;
276   typedef Array<IntType, kResultElementCount> ResultType;
277   typedef IntType ResultElementType;
278 
279   PHILOX_DEVICE_INLINE
operator()280   ResultType operator()(Generator* gen) {
281     typename Generator::ResultType sample = (*gen)();
282     ResultType result;
283     for (int i = 0; i < kResultElementCount; ++i) {
284       result[i] = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
285     }
286     return result;
287   }
288 };
289 
290 template <typename Generator>
291 class UniformFullIntDistribution<Generator, int32>
292     : public UniformFullIntDistribution32<Generator, int32> {};
293 template <typename Generator>
294 class UniformFullIntDistribution<Generator, uint32>
295     : public UniformFullIntDistribution32<Generator, uint32> {};
296 template <typename Generator>
297 class UniformFullIntDistribution<Generator, int64>
298     : public UniformFullIntDistribution64<Generator, int64> {};
299 template <typename Generator>
300 class UniformFullIntDistribution<Generator, uint64>
301     : public UniformFullIntDistribution64<Generator, uint64> {};
302 
303 // A class that adapts the underlying native multiple samples to return a single
304 // sample at a time.
305 template <class Generator>
306 class SingleSampleAdapter {
307  public:
308   // The number of elements that will be returned.
309   static constexpr int kResultElementCount = 1;
310   // The number of elements that will be returned by the underlying generator.
311   static constexpr int kNativeElementCount = Generator::kResultElementCount;
312   typedef typename Generator::ResultElementType ResultType;
313   typedef typename Generator::ResultElementType ResultElementType;
314 
315   PHILOX_DEVICE_INLINE
SingleSampleAdapter(Generator * gen)316   explicit SingleSampleAdapter(Generator* gen)
317       : generator_(gen), used_result_index_(Generator::kResultElementCount) {}
318 
319   PHILOX_DEVICE_INLINE
operator()320   ResultType operator()() {
321     if (used_result_index_ == Generator::kResultElementCount) {
322       unused_results_ = (*generator_)();
323       used_result_index_ = 0;
324     }
325 
326     return unused_results_[used_result_index_++];
327   }
328 
329   PHILOX_DEVICE_INLINE
Skip(uint64 num_skips)330   void Skip(uint64 num_skips) {
331     if (!num_skips) {
332       return;
333     }
334     int num_unused_results = kNativeElementCount - used_result_index_;
335     if (num_skips <= num_unused_results) {
336       used_result_index_ += num_skips;
337       return;
338     }
339     num_skips -= num_unused_results;
340     used_result_index_ = kNativeElementCount;
341     SkipFromGenerator(num_skips / kNativeElementCount);
342     num_skips = num_skips % kNativeElementCount;
343     if (num_skips) {
344       unused_results_ = (*generator_)();
345       used_result_index_ = num_skips;
346     }
347   }
348 
349  private:
350   // This implementation iteratively skips over `num_skips` samples
351   // from `generator_`. There is an O(1) implementation for PhiloxRandom
352   // in random_distributions.cc.
353   PHILOX_DEVICE_INLINE
SkipFromGenerator(uint64 num_skips)354   void SkipFromGenerator(uint64 num_skips) {
355     while (num_skips--) {
356       (*generator_)();
357     }
358   }
359 
360   Generator* generator_;
361   typename Generator::ResultType unused_results_;
362   int used_result_index_;
363 };
364 
365 // A class that generates unit normal distribution random numbers from the
366 // underlying random integer generator.
367 // Arguments:
368 //   Generator: a generator type that returns a number of uint32 upon each
369 //              each invocation. It needs to define kResultElementCount for the
370 //              sample count for each invocation, and ResultType for actual
371 //              returned sample type.
372 //   RealType: the data type of the real numbers that will be returned by the
373 //             distribution. This could be either float or double for now.
374 // This class is meant to be implemented through specialization. The default
375 // is not defined by design.
376 template <class Generator, typename RealType>
377 class NormalDistribution;
378 
379 PHILOX_DEVICE_INLINE
380 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1);
381 
382 PHILOX_DEVICE_INLINE
383 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
384                      double* d1);
385 
386 // Exactly like the float version, except that we convert to half afterwards;
387 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
388 // gain from working in half internally.
389 template <class Generator>
390 class NormalDistribution<Generator, Eigen::half> {
391  public:
392   // The number of elements that will be returned.
393   static constexpr int kResultElementCount = Generator::kResultElementCount;
394   // Cost of generation of a single element (in cycles).
395   static constexpr int kElementCost = 70;
396   // Indicate that this distribution may take variable number of samples
397   // during the runtime.
398   static constexpr bool kVariableSamplesPerOutput = false;
399   typedef Array<Eigen::half, kResultElementCount> ResultType;
400   typedef Eigen::half ResultElementType;
401 
402   PHILOX_DEVICE_INLINE
operator()403   ResultType operator()(Generator* gen) {
404     typename Generator::ResultType sample = (*gen)();
405     ResultType result;
406     for (int i = 0; i < kResultElementCount; i += 2) {
407       float f[2];
408       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
409       result[i] = Eigen::half(f[0]);
410       result[i + 1] = Eigen::half(f[1]);
411     }
412     return result;
413   }
414 };
415 
416 template <class Generator>
417 class NormalDistribution<Generator, bfloat16> {
418  public:
419   // The number of elements that will be returned.
420   static constexpr int kResultElementCount = Generator::kResultElementCount;
421   // Cost of generation of a single element (in cycles).
422   static constexpr int kElementCost = 70;
423   // Indicate that this distribution may take variable number of samples
424   // during the runtime.
425   static constexpr bool kVariableSamplesPerOutput = false;
426   typedef Array<bfloat16, kResultElementCount> ResultType;
427   typedef bfloat16 ResultElementType;
428 
429   PHILOX_DEVICE_INLINE
operator()430   ResultType operator()(Generator* gen) {
431     typename Generator::ResultType sample = (*gen)();
432     ResultType result;
433     static_assert(kResultElementCount % 2 == 0,
434                   "kResultElementCount should be an even number");
435     for (int i = 0; i < kResultElementCount; i += 2) {
436       float f[2];
437       // Box-Muller transform requires processing 2 elements at a time.
438       BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
439       result[i] = bfloat16(f[0]);
440       result[i + 1] = bfloat16(f[1]);
441     }
442     return result;
443   }
444 };
445 
446 template <class Generator>
447 class NormalDistribution<Generator, float> {
448  public:
449   // The number of elements that will be returned.
450   static constexpr int kResultElementCount = Generator::kResultElementCount;
451   // Cost of generation of a single element (in cycles).
452   static constexpr int kElementCost = 70;
453   // Indicate that this distribution may take variable number of samples
454   // during the runtime.
455   static constexpr bool kVariableSamplesPerOutput = false;
456   typedef Array<float, kResultElementCount> ResultType;
457   typedef float ResultElementType;
458 
459   PHILOX_DEVICE_INLINE
operator()460   ResultType operator()(Generator* gen) {
461     typename Generator::ResultType sample = (*gen)();
462     ResultType result;
463     for (int i = 0; i < kResultElementCount; i += 2) {
464       BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
465     }
466     return result;
467   }
468 };
469 
470 template <class Generator>
471 class NormalDistribution<Generator, double> {
472  public:
473   // The number of elements that will be returned.
474   static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
475   // Cost of generation of a single element (in cycles).
476   static constexpr int kElementCost = 70;
477   // Indicate that this distribution may take variable number of samples
478   // during the runtime.
479   static constexpr bool kVariableSamplesPerOutput = false;
480   typedef Array<double, kResultElementCount> ResultType;
481   typedef double ResultElementType;
482 
483   PHILOX_DEVICE_INLINE
operator()484   ResultType operator()(Generator* gen) {
485     typename Generator::ResultType sample = (*gen)();
486     ResultType result;
487     for (int i = 0; i < kResultElementCount; i += 2) {
488       const int i2 = 2 * i;
489       BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2],
490                       sample[i2 + 3], &result[i], &result[i + 1]);
491     }
492     return result;
493   }
494 };
495 
496 // A class that returns standard normal distribution between
497 // [-kTruncateValue, kTruncateValue].
498 // Arguments:
499 //   Generator: a generator type that returns a number of uint32 upon each
500 //              each invocation. It needs to define kResultElementCount for the
501 //              sample count for each invocation, and ResultType for actual
502 //              returned sample type.
503 //   RealType: the data type of the real numbers that will be returned by the
504 //             distribution. This could be either float or double for now.
505 // This class is meant to be implemented through specialization. The default
506 // is not defined by design.
507 template <class SingleSampleGenerator, typename RealType>
508 class TruncatedNormalDistribution;
509 
510 // Exactly like the float version, except that we convert to half afterwards;
511 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
512 // gain from working in half internally.
513 template <class SingleSampleGenerator>
514 class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
515  public:
516   // The number of elements that will be returned.
517   static constexpr int kResultElementCount =
518       SingleSampleGenerator::kNativeElementCount;
519   // Cost of generation of a single element (in cycles).
520   static constexpr int kElementCost = 90;
521   // Indicate that this distribution may take variable number of samples
522   // during the runtime.
523   static constexpr bool kVariableSamplesPerOutput = true;
524   // The threshold where the normal distribution is truncated.
525   const float kTruncateValue = 2.0f;
526 
527   typedef Array<Eigen::half, kResultElementCount> ResultType;
528   typedef Eigen::half ResultElementType;
529 
530   PHILOX_DEVICE_INLINE
operator()531   ResultType operator()(SingleSampleGenerator* gen) {
532     ResultType results;
533     int index = 0;
534     while (true) {
535       // Repeatedly take samples from the normal distribution, until we have
536       // the desired number of elements that fall within the pre-defined cutoff
537       // threshold.
538       const uint32 x0 = (*gen)();
539       const uint32 x1 = (*gen)();
540       float f[2];
541       BoxMullerFloat(x0, x1, &f[0], &f[1]);
542 
543       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
544         results[index++] = Eigen::half(f[0]);
545         if (index >= kResultElementCount) {
546           return results;
547         }
548       }
549       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
550         results[index++] = Eigen::half(f[1]);
551         if (index >= kResultElementCount) {
552           return results;
553         }
554       }
555     }
556   }
557 };
558 
559 template <class SingleSampleGenerator>
560 class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
561  public:
562   // The number of elements that will be returned.
563   static constexpr int kResultElementCount =
564       SingleSampleGenerator::kNativeElementCount;
565   // Cost of generation of a single element (in cycles).
566   static constexpr int kElementCost = 90;
567   // Indicate that this distribution may take variable number of samples
568   // during the runtime.
569   static constexpr bool kVariableSamplesPerOutput = true;
570   // The threshold where the normal distribution is truncated.
571   const float kTruncateValue = 2.0f;
572 
573   typedef Array<bfloat16, kResultElementCount> ResultType;
574   typedef bfloat16 ResultElementType;
575 
576   PHILOX_DEVICE_INLINE
operator()577   ResultType operator()(SingleSampleGenerator* gen) {
578     ResultType results;
579     int index = 0;
580     while (true) {
581       // Repeatedly take samples from the normal distribution, until we have
582       // the desired number of elements that fall within the pre-defined cutoff
583       // threshold.
584       const uint32 x0 = (*gen)();
585       const uint32 x1 = (*gen)();
586       float f[2];
587       BoxMullerFloat(x0, x1, &f[0], &f[1]);
588 
589       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
590         results[index++] = bfloat16(f[0]);
591         if (index >= kResultElementCount) {
592           return results;
593         }
594       }
595       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
596         results[index++] = bfloat16(f[1]);
597         if (index >= kResultElementCount) {
598           return results;
599         }
600       }
601     }
602   }
603 };
604 
605 // Partial specialization for float.
606 template <class SingleSampleGenerator>
607 class TruncatedNormalDistribution<SingleSampleGenerator, float> {
608  public:
609   // The number of elements that will be returned.
610   static constexpr int kResultElementCount =
611       SingleSampleGenerator::kNativeElementCount;
612   // Cost of generation of a single element (in cycles).
613   static constexpr int kElementCost = 90;
614   // Indicate that this distribution may take variable number of samples
615   // during the runtime.
616   static constexpr bool kVariableSamplesPerOutput = true;
617   // The threshold where the normal distribution is truncated.
618   const float kTruncateValue = 2.0f;
619 
620   typedef Array<float, kResultElementCount> ResultType;
621   typedef float ResultElementType;
622 
623   PHILOX_DEVICE_INLINE
operator()624   ResultType operator()(SingleSampleGenerator* gen) {
625     ResultType results;
626     int index = 0;
627     while (true) {
628       // Repeatedly take samples from the normal distribution, until we have
629       // the desired number of elements that fall within the pre-defined cutoff
630       // threshold.
631       const uint32 x0 = (*gen)();
632       const uint32 x1 = (*gen)();
633       float f[2];
634       BoxMullerFloat(x0, x1, &f[0], &f[1]);
635 
636       if (Eigen::numext::abs(f[0]) < kTruncateValue) {
637         results[index++] = f[0];
638         if (index >= kResultElementCount) {
639           return results;
640         }
641       }
642       if (Eigen::numext::abs(f[1]) < kTruncateValue) {
643         results[index++] = f[1];
644         if (index >= kResultElementCount) {
645           return results;
646         }
647       }
648     }
649   }
650 };
651 
652 // Partial specialization for double.
653 template <class SingleSampleGenerator>
654 class TruncatedNormalDistribution<SingleSampleGenerator, double> {
655  public:
656   // The number of elements that will be returned.
657   static constexpr int kResultElementCount =
658       (SingleSampleGenerator::kNativeElementCount > 1)
659           ? SingleSampleGenerator::kNativeElementCount / 2
660           : 1;
661   // Cost of generation of a single element (in cycles).
662   static constexpr int kElementCost = 90;
663   // Indicate that this distribution may take variable number of samples
664   // during the runtime.
665   static constexpr bool kVariableSamplesPerOutput = true;
666   typedef Array<double, kResultElementCount> ResultType;
667   typedef double ResultElementType;
668   const double kTruncateValue = 2.0;
669 
670   PHILOX_DEVICE_INLINE
operator()671   ResultType operator()(SingleSampleGenerator* gen) {
672     ResultType results;
673     int index = 0;
674     while (1) {
675       const uint32 x0 = (*gen)();
676       const uint32 x1 = (*gen)();
677       const uint32 x2 = (*gen)();
678       const uint32 x3 = (*gen)();
679       double d[2];
680       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
681 
682       if (Eigen::numext::abs(d[0]) < kTruncateValue) {
683         results[index++] = d[0];
684         if (index >= kResultElementCount) {
685           return results;
686         }
687       }
688       if (Eigen::numext::abs(d[1]) < kTruncateValue) {
689         results[index++] = d[1];
690         if (index >= kResultElementCount) {
691           return results;
692         }
693       }
694     }
695   }
696 };
697 
698 // Helper function to convert two 32-bit uniform integers to two floats
699 // under the unit normal distribution.
700 PHILOX_DEVICE_INLINE
BoxMullerFloat(uint32 x0,uint32 x1,float * f0,float * f1)701 void BoxMullerFloat(uint32 x0, uint32 x1, float* f0, float* f1) {
702   // This function implements the Box-Muller transform:
703   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
704   // Do not send a really small number to log().
705   // We cannot mark "epsilon" as "static const" because NVCC would complain
706   const float epsilon = 1.0e-7f;
707   float u1 = Uint32ToFloat(x0);
708   if (u1 < epsilon) {
709     u1 = epsilon;
710   }
711   const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
712   const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
713 #if !defined(__linux__)
714   *f0 = Eigen::numext::sin(v1);
715   *f1 = Eigen::numext::cos(v1);
716 #else
717   sincosf(v1, f0, f1);
718 #endif
719   *f0 *= u2;
720   *f1 *= u2;
721 }
722 
723 // Helper function to convert four 32-bit uniform integers to two doubles
724 // under the unit normal distribution.
725 PHILOX_DEVICE_INLINE
BoxMullerDouble(uint32 x0,uint32 x1,uint32 x2,uint32 x3,double * d0,double * d1)726 void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
727                      double* d1) {
728   // This function implements the Box-Muller transform:
729   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
730   // Do not send a really small number to log().
731   // We cannot mark "epsilon" as "static const" because NVCC would complain
732   const double epsilon = 1.0e-7;
733   double u1 = Uint64ToDouble(x0, x1);
734   if (u1 < epsilon) {
735     u1 = epsilon;
736   }
737   const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
738   const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
739 #if !defined(__linux__)
740   *d0 = Eigen::numext::sin(v1);
741   *d1 = Eigen::numext::cos(v1);
742 #else
743   sincos(v1, d0, d1);
744 #endif
745   *d0 *= u2;
746   *d1 *= u2;
747 }
748 
749 // Helper function to convert an 16-bit integer to a half between [0..1).
Uint16ToHalf(uint16 x)750 PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
751   // IEEE754 halfs are formatted as follows (MSB first):
752   //    sign(1) exponent(5) mantissa(10)
753   // Conceptually construct the following:
754   //    sign == 0
755   //    exponent == 15  -- an excess 15 representation of a zero exponent
756   //    mantissa == 10 random bits
757   const uint16 man = x & 0x3ffu;  // 10 bit mantissa
758   const uint16 exp = static_cast<uint16>(15);
759   const uint16 val = (exp << 10) | man;
760 
761   Eigen::half result;
762   result.x = val;
763   return result - Eigen::half(1.0);
764 }
765 
766 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
767 // This can create a uniform distribution of values between [0..1).
Uint16ToGfloat16(uint16 x)768 PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
769   // bfloat are formatted as follows (MSB first):
770   //    sign(1) exponent(8) mantissa(7)
771   // Conceptually construct the following:
772   //    sign == 0
773   //    exponent == 127  -- an excess 127 representation of a zero exponent
774   //    mantissa == 7 random bits
775   const uint16 man = x & 0x7fu;  // 7 bit mantissa
776   const uint16 exp = static_cast<uint16>(127);
777   const uint16 val = (exp << 7) | man;
778 
779   bfloat16 result;
780   memcpy(&result, &val, sizeof(val));
781   // The mantissa has an implicit leading 1, so the above code creates a value
782   // in [1, 2). The minus will not cause a rounding that makes the result 1.
783   // Instead it will just be close to 1.
784   return result - bfloat16(1.0);
785 }
786 
787 // Helper function to convert an 32-bit integer to a float between [0..1).
Uint32ToFloat(uint32 x)788 PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32 x) {
789   // IEEE754 floats are formatted as follows (MSB first):
790   //    sign(1) exponent(8) mantissa(23)
791   // Conceptually construct the following:
792   //    sign == 0
793   //    exponent == 127  -- an excess 127 representation of a zero exponent
794   //    mantissa == 23 random bits
795   const uint32 man = x & 0x7fffffu;  // 23 bit mantissa
796   const uint32 exp = static_cast<uint32>(127);
797   const uint32 val = (exp << 23) | man;
798 
799   // Assumes that endian-ness is same for float and uint32.
800   float result;
801   memcpy(&result, &val, sizeof(val));
802   return result - 1.0f;
803 }
804 
805 // Helper function to convert two 32-bit integers to a double between [0..1).
Uint64ToDouble(uint32 x0,uint32 x1)806 PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32 x0, uint32 x1) {
807   // IEEE754 doubles are formatted as follows (MSB first):
808   //    sign(1) exponent(11) mantissa(52)
809   // Conceptually construct the following:
810   //    sign == 0
811   //    exponent == 1023  -- an excess 1023 representation of a zero exponent
812   //    mantissa == 52 random bits
813   const uint32 mhi = x0 & 0xfffffu;  // upper 20 bits of mantissa
814   const uint32 mlo = x1;             // lower 32 bits of mantissa
815   const uint64 man = (static_cast<uint64>(mhi) << 32) | mlo;  // mantissa
816   const uint64 exp = static_cast<uint64>(1023);
817   const uint64 val = (exp << 52) | man;
818   // Assumes that endian-ness is same for double and uint64.
819   double result;
820   memcpy(&result, &val, sizeof(val));
821   return result - 1.0;
822 }
823 
824 }  // namespace random
825 }  // namespace tensorflow
826 
827 #endif  // TENSORFLOW_CORE_LIB_RANDOM_RANDOM_DISTRIBUTIONS_H_
828