philox_random.h revision 6855bcc08cfcbba1a20699b3d53458a490cde2a8
1/* Copyright 2015 Google Inc. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15 16// Implement the Philox algorithm to generate random numbers in parallel. 17// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. 18// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf 19 20#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 21#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 22 23#include <stdlib.h> 24 25#include "tensorflow/core/platform/types.h" 26 27// Function qualifiers that need to work on both CPU and GPU. 28#if defined(__CUDACC__) 29// For nvcc. 30#define PHILOX_DEVICE_FUNC __host__ __device__ 31#define PHILOX_INLINE __inline__ 32#else 33// For non-nvcc. 34#define PHILOX_DEVICE_FUNC 35#define PHILOX_INLINE inline 36#endif 37#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE 38 39#include <math.h> 40 41namespace tensorflow { 42namespace random { 43 44// A class that represents an inline array. It can be used on both CPU and GPU, 45// and also trivially copyable between CPU and GPU. 46// Arguments: 47// T: the array element type; 48// ElementCount: the fixed size of the array; 49template <typename T, int ElementCount> 50class Array { 51 public: 52 PHILOX_DEVICE_INLINE Array() { 53 for (int i = 0; i < ElementCount; ++i) { 54 data_[i] = T(); 55 } 56 } 57 58 PHILOX_DEVICE_INLINE const T& operator[](int index) const { 59 return data_[index]; 60 } 61 62 PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; } 63 64 size_t size() const { return ElementCount; } 65 66 private: 67 T data_[ElementCount]; 68}; 69 70// A class that encapsulates all the states for a random number generator using 71// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits 72// in the form of four uint32. 73// There are multiple variants of this algorithm, we picked the 4x32_10 version 74// that is most suited for our applications. 75// Since this class is meant to be copied between CPU to GPU, it maintains a 76// value semantics. 77// 78// For example: To use this class and populate an array of 1024 randoms on CPU 79// with two threads, 80// 81// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { 82// assert(start % 4 == 0); 83// assert(limit % 4 == 0); 84// rnd.Skip(start / 4); 85// for (int i = start; i < limit; i += 4) { 86// auto sample = rnd(); 87// ... copy sample[0..3] to output[i..i+3] 88// } 89// } 90// 91// PhiloxRandom rng(seed); 92// PhiloxRandom rng_copy = rng; 93// rng.Skip(1000/4); 94// 95// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; 96// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; 97// ... wait for thread 1 & 2 to finish executing Fill(). 98// 99// NOTE: 100// 1. PhiloxRandom is trivially copyable. 101// 2. PhiloxRandom is compilable by gcc and nvcc. 102class PhiloxRandom { 103 public: 104 typedef Array<uint32, 4> ResultType; 105 typedef uint32 ResultElementType; 106 // The number of elements that will be returned. 107 static const int kResultElementCount = 4; 108 // Cost of generation of a single element (in cycles). 109 static const int kElementCost = 10; 110 111 PHILOX_DEVICE_INLINE 112 PhiloxRandom() {} 113 114 PHILOX_DEVICE_INLINE 115 explicit PhiloxRandom(uint64 seed) { 116 key_[0] = static_cast<uint32>(seed); 117 key_[1] = static_cast<uint32>(seed >> 32); 118 } 119 120 PHILOX_DEVICE_INLINE 121 explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) { 122 key_[0] = static_cast<uint32>(seed_lo); 123 key_[1] = static_cast<uint32>(seed_lo >> 32); 124 counter_[2] = static_cast<uint32>(seed_hi); 125 counter_[3] = static_cast<uint32>(seed_hi >> 32); 126 } 127 128 // Skip the specified number of samples of 128-bits in the current stream. 129 PHILOX_DEVICE_INLINE 130 void Skip(uint64 count) { 131 const uint32 count_lo = static_cast<uint32>(count); 132 uint32 count_hi = static_cast<uint32>(count >> 32); 133 134 counter_[0] += count_lo; 135 if (counter_[0] < count_lo) { 136 ++count_hi; 137 } 138 139 counter_[1] += count_hi; 140 if (counter_[1] < count_hi) { 141 if (++counter_[2] == 0) { 142 ++counter_[3]; 143 } 144 } 145 } 146 147 // Returns a group of four random numbers using the underlying Philox 148 // algorithm. 149 PHILOX_DEVICE_INLINE ResultType operator()() { 150 ResultType counter = counter_; 151 Key key = key_; 152 153 // Run the single rounds for ten times. Manually unrolling the loop 154 // for better performance. 155 counter = ComputeSingleRound(counter, key); 156 RaiseKey(&key); 157 counter = ComputeSingleRound(counter, key); 158 RaiseKey(&key); 159 counter = ComputeSingleRound(counter, key); 160 RaiseKey(&key); 161 counter = ComputeSingleRound(counter, key); 162 RaiseKey(&key); 163 counter = ComputeSingleRound(counter, key); 164 RaiseKey(&key); 165 counter = ComputeSingleRound(counter, key); 166 RaiseKey(&key); 167 counter = ComputeSingleRound(counter, key); 168 RaiseKey(&key); 169 counter = ComputeSingleRound(counter, key); 170 RaiseKey(&key); 171 counter = ComputeSingleRound(counter, key); 172 RaiseKey(&key); 173 counter = ComputeSingleRound(counter, key); 174 175 SkipOne(); 176 177 return counter; 178 } 179 180 private: 181 // The type for the 64-bit key stored in the form of two 32-bit uint 182 // that are used in the diffusion process. 183 typedef Array<uint32, 2> Key; 184 185 // We use the same constants as recommended by the original paper. 186 static const uint32 kPhiloxW32A = 0x9E3779B9; 187 static const uint32 kPhiloxW32B = 0xBB67AE85; 188 static const uint32 kPhiloxM4x32A = 0xD2511F53; 189 static const uint32 kPhiloxM4x32B = 0xCD9E8D57; 190 191 // Helper function to skip the next sample of 128-bits in the current stream. 192 PHILOX_DEVICE_INLINE void SkipOne() { 193 if (++counter_[0] == 0) { 194 if (++counter_[1] == 0) { 195 if (++counter_[2] == 0) { 196 ++counter_[3]; 197 } 198 } 199 } 200 } 201 202 // Helper function to return the lower and higher 32-bits from two 32-bit 203 // integer multiplications. 204 PHILOX_DEVICE_INLINE 205 static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low, 206 uint32* result_high) { 207#ifndef __GCUDACC__ 208 const uint64 product = static_cast<uint64>(a) * b; 209 *result_low = static_cast<uint32>(product); 210 *result_high = static_cast<uint32>(product >> 32); 211#else 212 *result_low = a * b; 213 *result_high = __umulhi(a, b); 214#endif 215 } 216 217 // Helper function for a single round of the underlying Philox algorithm. 218 PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound( 219 const ResultType& counter, const Key& key) { 220 uint32 lo0; 221 uint32 hi0; 222 MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); 223 224 uint32 lo1; 225 uint32 hi1; 226 MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); 227 228 ResultType result; 229 result[0] = hi1 ^ counter[1] ^ key[0]; 230 result[1] = lo1; 231 result[2] = hi0 ^ counter[3] ^ key[1]; 232 result[3] = lo0; 233 return result; 234 } 235 236 PHILOX_DEVICE_INLINE void RaiseKey(Key* key) { 237 (*key)[0] += kPhiloxW32A; 238 (*key)[1] += kPhiloxW32B; 239 } 240 241 private: 242 ResultType counter_; 243 Key key_; 244}; 245 246} // namespace random 247} // namespace tensorflow 248 249#endif // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 250