philox_random.h revision 6855bcc08cfcbba1a20699b3d53458a490cde2a8
1/* Copyright 2015 Google Inc. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7    http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16// Implement the Philox algorithm to generate random numbers in parallel.
17// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
18//   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
19
20#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
21#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
22
23#include <stdlib.h>
24
25#include "tensorflow/core/platform/types.h"
26
27// Function qualifiers that need to work on both CPU and GPU.
28#if defined(__CUDACC__)
29// For nvcc.
30#define PHILOX_DEVICE_FUNC __host__ __device__
31#define PHILOX_INLINE __inline__
32#else
33// For non-nvcc.
34#define PHILOX_DEVICE_FUNC
35#define PHILOX_INLINE inline
36#endif
37#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
38
39#include <math.h>
40
41namespace tensorflow {
42namespace random {
43
44// A class that represents an inline array. It can be used on both CPU and GPU,
45// and also trivially copyable between CPU and GPU.
46// Arguments:
47//   T: the array element type;
48//   ElementCount: the fixed size of the array;
49template <typename T, int ElementCount>
50class Array {
51 public:
52  PHILOX_DEVICE_INLINE Array() {
53    for (int i = 0; i < ElementCount; ++i) {
54      data_[i] = T();
55    }
56  }
57
58  PHILOX_DEVICE_INLINE const T& operator[](int index) const {
59    return data_[index];
60  }
61
62  PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; }
63
64  size_t size() const { return ElementCount; }
65
66 private:
67  T data_[ElementCount];
68};
69
70// A class that encapsulates all the states for a random number generator using
71// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
72// in the form of four uint32.
73// There are multiple variants of this algorithm, we picked the 4x32_10 version
74// that is most suited for our applications.
75// Since this class is meant to be copied between CPU to GPU, it maintains a
76// value semantics.
77//
78// For example: To use this class and populate an array of 1024 randoms on CPU
79// with two threads,
80//
81//  void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
82//    assert(start % 4 == 0);
83//    assert(limit % 4 == 0);
84//    rnd.Skip(start / 4);
85//    for (int i = start; i < limit; i += 4) {
86//      auto sample = rnd();
87//      ... copy sample[0..3] to output[i..i+3]
88//    }
89//  }
90//
91//  PhiloxRandom rng(seed);
92//  PhiloxRandom rng_copy = rng;
93//  rng.Skip(1000/4);
94//
95//  ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
96//  ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
97//  ... wait for thread 1 & 2 to finish executing Fill().
98//
99// NOTE:
100// 1. PhiloxRandom is trivially copyable.
101// 2. PhiloxRandom is compilable by gcc and nvcc.
102class PhiloxRandom {
103 public:
104  typedef Array<uint32, 4> ResultType;
105  typedef uint32 ResultElementType;
106  // The number of elements that will be returned.
107  static const int kResultElementCount = 4;
108  // Cost of generation of a single element (in cycles).
109  static const int kElementCost = 10;
110
111  PHILOX_DEVICE_INLINE
112  PhiloxRandom() {}
113
114  PHILOX_DEVICE_INLINE
115  explicit PhiloxRandom(uint64 seed) {
116    key_[0] = static_cast<uint32>(seed);
117    key_[1] = static_cast<uint32>(seed >> 32);
118  }
119
120  PHILOX_DEVICE_INLINE
121  explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) {
122    key_[0] = static_cast<uint32>(seed_lo);
123    key_[1] = static_cast<uint32>(seed_lo >> 32);
124    counter_[2] = static_cast<uint32>(seed_hi);
125    counter_[3] = static_cast<uint32>(seed_hi >> 32);
126  }
127
128  // Skip the specified number of samples of 128-bits in the current stream.
129  PHILOX_DEVICE_INLINE
130  void Skip(uint64 count) {
131    const uint32 count_lo = static_cast<uint32>(count);
132    uint32 count_hi = static_cast<uint32>(count >> 32);
133
134    counter_[0] += count_lo;
135    if (counter_[0] < count_lo) {
136      ++count_hi;
137    }
138
139    counter_[1] += count_hi;
140    if (counter_[1] < count_hi) {
141      if (++counter_[2] == 0) {
142        ++counter_[3];
143      }
144    }
145  }
146
147  // Returns a group of four random numbers using the underlying Philox
148  // algorithm.
149  PHILOX_DEVICE_INLINE ResultType operator()() {
150    ResultType counter = counter_;
151    Key key = key_;
152
153    // Run the single rounds for ten times. Manually unrolling the loop
154    // for better performance.
155    counter = ComputeSingleRound(counter, key);
156    RaiseKey(&key);
157    counter = ComputeSingleRound(counter, key);
158    RaiseKey(&key);
159    counter = ComputeSingleRound(counter, key);
160    RaiseKey(&key);
161    counter = ComputeSingleRound(counter, key);
162    RaiseKey(&key);
163    counter = ComputeSingleRound(counter, key);
164    RaiseKey(&key);
165    counter = ComputeSingleRound(counter, key);
166    RaiseKey(&key);
167    counter = ComputeSingleRound(counter, key);
168    RaiseKey(&key);
169    counter = ComputeSingleRound(counter, key);
170    RaiseKey(&key);
171    counter = ComputeSingleRound(counter, key);
172    RaiseKey(&key);
173    counter = ComputeSingleRound(counter, key);
174
175    SkipOne();
176
177    return counter;
178  }
179
180 private:
181  // The type for the 64-bit key stored in the form of two 32-bit uint
182  // that are used in the diffusion process.
183  typedef Array<uint32, 2> Key;
184
185  // We use the same constants as recommended by the original paper.
186  static const uint32 kPhiloxW32A = 0x9E3779B9;
187  static const uint32 kPhiloxW32B = 0xBB67AE85;
188  static const uint32 kPhiloxM4x32A = 0xD2511F53;
189  static const uint32 kPhiloxM4x32B = 0xCD9E8D57;
190
191  // Helper function to skip the next sample of 128-bits in the current stream.
192  PHILOX_DEVICE_INLINE void SkipOne() {
193    if (++counter_[0] == 0) {
194      if (++counter_[1] == 0) {
195        if (++counter_[2] == 0) {
196          ++counter_[3];
197        }
198      }
199    }
200  }
201
202  // Helper function to return the lower and higher 32-bits from two 32-bit
203  // integer multiplications.
204  PHILOX_DEVICE_INLINE
205  static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low,
206                              uint32* result_high) {
207#ifndef __GCUDACC__
208    const uint64 product = static_cast<uint64>(a) * b;
209    *result_low = static_cast<uint32>(product);
210    *result_high = static_cast<uint32>(product >> 32);
211#else
212    *result_low = a * b;
213    *result_high = __umulhi(a, b);
214#endif
215  }
216
217  // Helper function for a single round of the underlying Philox algorithm.
218  PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(
219      const ResultType& counter, const Key& key) {
220    uint32 lo0;
221    uint32 hi0;
222    MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
223
224    uint32 lo1;
225    uint32 hi1;
226    MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
227
228    ResultType result;
229    result[0] = hi1 ^ counter[1] ^ key[0];
230    result[1] = lo1;
231    result[2] = hi0 ^ counter[3] ^ key[1];
232    result[3] = lo0;
233    return result;
234  }
235
236  PHILOX_DEVICE_INLINE void RaiseKey(Key* key) {
237    (*key)[0] += kPhiloxW32A;
238    (*key)[1] += kPhiloxW32B;
239  }
240
241 private:
242  ResultType counter_;
243  Key key_;
244};
245
246}  // namespace random
247}  // namespace tensorflow
248
249#endif  // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
250