1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Implement the Philox algorithm to generate random numbers in parallel.
17 // Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
18 //   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
19 
20 #ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
21 #define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
22 
23 #include <stdlib.h>
24 
25 #include "tensorflow/core/platform/types.h"
26 
27 // Function qualifiers that need to work on both CPU and GPU.
28 #if defined(__CUDACC__) || defined(__HIPCC__)
29 // For nvcc.
30 #define PHILOX_DEVICE_FUNC __host__ __device__
31 #define PHILOX_INLINE __inline__
32 #else
33 // For non-nvcc.
34 #define PHILOX_DEVICE_FUNC
35 #define PHILOX_INLINE inline
36 #endif
37 #define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
38 
39 #include <math.h>
40 
41 namespace tensorflow {
42 namespace random {
43 
44 // A class that represents an inline array. It can be used on both CPU and GPU,
45 // and also trivially copyable between CPU and GPU.
46 // Arguments:
47 //   T: the array element type;
48 //   ElementCount: the fixed size of the array;
49 template <typename T, int ElementCount>
50 class Array {
51  public:
52   static constexpr int kElementCount = ElementCount;
Array()53   PHILOX_DEVICE_INLINE Array() {
54     for (int i = 0; i < ElementCount; ++i) {
55       data_[i] = T(0);
56     }
57   }
58 
59   PHILOX_DEVICE_INLINE const T& operator[](int index) const {
60     return data_[index];
61   }
62 
63   PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; }
64 
size()65   size_t size() const { return ElementCount; }
66 
67  private:
68   T data_[ElementCount];
69 };
70 
71 // A class that encapsulates all the states for a random number generator using
72 // the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
73 // in the form of four uint32.
74 // There are multiple variants of this algorithm, we picked the 4x32_10 version
75 // that is most suited for our applications.
76 // Since this class is meant to be copied between CPU to GPU, it maintains a
77 // value semantics.
78 //
79 // For example: To use this class and populate an array of 1024 randoms on CPU
80 // with two threads,
81 //
82 //  void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
83 //    assert(start % 4 == 0);
84 //    assert(limit % 4 == 0);
85 //    rnd.Skip(start / 4);
86 //    for (int i = start; i < limit; i += 4) {
87 //      auto sample = rnd();
88 //      ... copy sample[0..3] to output[i..i+3]
89 //    }
90 //  }
91 //
92 //  PhiloxRandom rng(seed);
93 //  PhiloxRandom rng_copy = rng;
94 //  rng.Skip(1000/4);
95 //
96 //  ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
97 //  ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
98 //  ... wait for thread 1 & 2 to finish executing Fill().
99 //
100 // NOTE:
101 // 1. PhiloxRandom is trivially copyable.
102 // 2. PhiloxRandom is compilable by gcc and nvcc.
103 class PhiloxRandom {
104  public:
105   using ResultType = Array<uint32, 4>;
106   using ResultElementType = uint32;
107   // The number of elements that will be returned.
108   static constexpr int kResultElementCount = 4;
109   // Cost of generation of a single element (in cycles).
110   static constexpr int kElementCost = 10;
111   // The type for the 64-bit key stored in the form of two 32-bit uint
112   // that are used in the diffusion process.
113   using Key = Array<uint32, 2>;
114 
115   PHILOX_DEVICE_INLINE
PhiloxRandom()116   PhiloxRandom() {}
117 
118   PHILOX_DEVICE_INLINE
PhiloxRandom(uint64 seed)119   explicit PhiloxRandom(uint64 seed) {
120     key_[0] = static_cast<uint32>(seed);
121     key_[1] = static_cast<uint32>(seed >> 32);
122   }
123 
124   PHILOX_DEVICE_INLINE
PhiloxRandom(uint64 seed_lo,uint64 seed_hi)125   explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) {
126     key_[0] = static_cast<uint32>(seed_lo);
127     key_[1] = static_cast<uint32>(seed_lo >> 32);
128     counter_[2] = static_cast<uint32>(seed_hi);
129     counter_[3] = static_cast<uint32>(seed_hi >> 32);
130   }
131 
132   PHILOX_DEVICE_INLINE
PhiloxRandom(ResultType counter,Key key)133   PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
134 
135   PHILOX_DEVICE_INLINE
counter()136   ResultType const& counter() const { return counter_; }
137 
138   PHILOX_DEVICE_INLINE
key()139   Key const& key() const { return key_; }
140 
141   // Skip the specified number of samples of 128-bits in the current stream.
142   PHILOX_DEVICE_INLINE
Skip(uint64 count)143   void Skip(uint64 count) {
144     const uint32 count_lo = static_cast<uint32>(count);
145     uint32 count_hi = static_cast<uint32>(count >> 32);
146 
147     counter_[0] += count_lo;
148     if (counter_[0] < count_lo) {
149       ++count_hi;
150     }
151 
152     counter_[1] += count_hi;
153     if (counter_[1] < count_hi) {
154       if (++counter_[2] == 0) {
155         ++counter_[3];
156       }
157     }
158   }
159 
160   // Returns a group of four random numbers using the underlying Philox
161   // algorithm.
operator()162   PHILOX_DEVICE_INLINE ResultType operator()() {
163     ResultType counter = counter_;
164     Key key = key_;
165 
166     // Run the single rounds for ten times. Manually unrolling the loop
167     // for better performance.
168     counter = ComputeSingleRound(counter, key);
169     RaiseKey(&key);
170     counter = ComputeSingleRound(counter, key);
171     RaiseKey(&key);
172     counter = ComputeSingleRound(counter, key);
173     RaiseKey(&key);
174     counter = ComputeSingleRound(counter, key);
175     RaiseKey(&key);
176     counter = ComputeSingleRound(counter, key);
177     RaiseKey(&key);
178     counter = ComputeSingleRound(counter, key);
179     RaiseKey(&key);
180     counter = ComputeSingleRound(counter, key);
181     RaiseKey(&key);
182     counter = ComputeSingleRound(counter, key);
183     RaiseKey(&key);
184     counter = ComputeSingleRound(counter, key);
185     RaiseKey(&key);
186     counter = ComputeSingleRound(counter, key);
187 
188     SkipOne();
189 
190     return counter;
191   }
192 
193  private:
194   // We use the same constants as recommended by the original paper.
195   static constexpr uint32 kPhiloxW32A = 0x9E3779B9;
196   static constexpr uint32 kPhiloxW32B = 0xBB67AE85;
197   static constexpr uint32 kPhiloxM4x32A = 0xD2511F53;
198   static constexpr uint32 kPhiloxM4x32B = 0xCD9E8D57;
199 
200   // Helper function to skip the next sample of 128-bits in the current stream.
SkipOne()201   PHILOX_DEVICE_INLINE void SkipOne() {
202     if (++counter_[0] == 0) {
203       if (++counter_[1] == 0) {
204         if (++counter_[2] == 0) {
205           ++counter_[3];
206         }
207       }
208     }
209   }
210 
211   // Helper function to return the lower and higher 32-bits from two 32-bit
212   // integer multiplications.
213   PHILOX_DEVICE_INLINE
MultiplyHighLow(uint32 a,uint32 b,uint32 * result_low,uint32 * result_high)214   static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low,
215                               uint32* result_high) {
216 #ifndef __CUDA_ARCH__
217     const uint64 product = static_cast<uint64>(a) * b;
218     *result_low = static_cast<uint32>(product);
219     *result_high = static_cast<uint32>(product >> 32);
220 #else
221     *result_low = a * b;
222     *result_high = __umulhi(a, b);
223 #endif
224   }
225 
226   // Helper function for a single round of the underlying Philox algorithm.
ComputeSingleRound(const ResultType & counter,const Key & key)227   PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(
228       const ResultType& counter, const Key& key) {
229     uint32 lo0;
230     uint32 hi0;
231     MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
232 
233     uint32 lo1;
234     uint32 hi1;
235     MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
236 
237     ResultType result;
238     result[0] = hi1 ^ counter[1] ^ key[0];
239     result[1] = lo1;
240     result[2] = hi0 ^ counter[3] ^ key[1];
241     result[3] = lo0;
242     return result;
243   }
244 
RaiseKey(Key * key)245   PHILOX_DEVICE_INLINE void RaiseKey(Key* key) {
246     (*key)[0] += kPhiloxW32A;
247     (*key)[1] += kPhiloxW32B;
248   }
249 
250  private:
251   ResultType counter_;
252   Key key_;
253 };
254 
255 }  // namespace random
256 }  // namespace tensorflow
257 
258 #endif  // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_
259