1/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15 16// Implement the Philox algorithm to generate random numbers in parallel. 17// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. 18// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf 19 20#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 21#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 22 23#include <stdlib.h> 24 25#include "tensorflow/core/platform/types.h" 26 27// Function qualifiers that need to work on both CPU and GPU. 28#if defined(__CUDACC__) 29// For nvcc. 30#define PHILOX_DEVICE_FUNC __host__ __device__ 31#define PHILOX_INLINE __inline__ 32#else 33// For non-nvcc. 34#define PHILOX_DEVICE_FUNC 35#define PHILOX_INLINE inline 36#endif 37#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE 38 39#include <math.h> 40 41namespace tensorflow { 42namespace random { 43 44// A class that represents an inline array. It can be used on both CPU and GPU, 45// and also trivially copyable between CPU and GPU. 46// Arguments: 47// T: the array element type; 48// ElementCount: the fixed size of the array; 49template <typename T, int ElementCount> 50class Array { 51 public: 52 PHILOX_DEVICE_INLINE Array() { 53 for (int i = 0; i < ElementCount; ++i) { 54 data_[i] = T(0); 55 } 56 } 57 58 PHILOX_DEVICE_INLINE const T& operator[](int index) const { 59 return data_[index]; 60 } 61 62 PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; } 63 64 size_t size() const { return ElementCount; } 65 66 private: 67 T data_[ElementCount]; 68}; 69 70// A class that encapsulates all the states for a random number generator using 71// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits 72// in the form of four uint32. 73// There are multiple variants of this algorithm, we picked the 4x32_10 version 74// that is most suited for our applications. 75// Since this class is meant to be copied between CPU to GPU, it maintains a 76// value semantics. 77// 78// For example: To use this class and populate an array of 1024 randoms on CPU 79// with two threads, 80// 81// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { 82// assert(start % 4 == 0); 83// assert(limit % 4 == 0); 84// rnd.Skip(start / 4); 85// for (int i = start; i < limit; i += 4) { 86// auto sample = rnd(); 87// ... copy sample[0..3] to output[i..i+3] 88// } 89// } 90// 91// PhiloxRandom rng(seed); 92// PhiloxRandom rng_copy = rng; 93// rng.Skip(1000/4); 94// 95// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; 96// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; 97// ... wait for thread 1 & 2 to finish executing Fill(). 98// 99// NOTE: 100// 1. PhiloxRandom is trivially copyable. 101// 2. PhiloxRandom is compilable by gcc and nvcc. 102class PhiloxRandom { 103 public: 104 using ResultType = Array<uint32, 4>; 105 using ResultElementType = uint32; 106 // The number of elements that will be returned. 107 static const int kResultElementCount = 4; 108 // Cost of generation of a single element (in cycles). 109 static const int kElementCost = 10; 110 // The type for the 64-bit key stored in the form of two 32-bit uint 111 // that are used in the diffusion process. 112 using Key = Array<uint32, 2>; 113 114 PHILOX_DEVICE_INLINE 115 PhiloxRandom() {} 116 117 PHILOX_DEVICE_INLINE 118 explicit PhiloxRandom(uint64 seed) { 119 key_[0] = static_cast<uint32>(seed); 120 key_[1] = static_cast<uint32>(seed >> 32); 121 } 122 123 PHILOX_DEVICE_INLINE 124 explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) { 125 key_[0] = static_cast<uint32>(seed_lo); 126 key_[1] = static_cast<uint32>(seed_lo >> 32); 127 counter_[2] = static_cast<uint32>(seed_hi); 128 counter_[3] = static_cast<uint32>(seed_hi >> 32); 129 } 130 131 PHILOX_DEVICE_INLINE 132 PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {} 133 134 // Skip the specified number of samples of 128-bits in the current stream. 135 PHILOX_DEVICE_INLINE 136 void Skip(uint64 count) { 137 const uint32 count_lo = static_cast<uint32>(count); 138 uint32 count_hi = static_cast<uint32>(count >> 32); 139 140 counter_[0] += count_lo; 141 if (counter_[0] < count_lo) { 142 ++count_hi; 143 } 144 145 counter_[1] += count_hi; 146 if (counter_[1] < count_hi) { 147 if (++counter_[2] == 0) { 148 ++counter_[3]; 149 } 150 } 151 } 152 153 // Returns a group of four random numbers using the underlying Philox 154 // algorithm. 155 PHILOX_DEVICE_INLINE ResultType operator()() { 156 ResultType counter = counter_; 157 Key key = key_; 158 159 // Run the single rounds for ten times. Manually unrolling the loop 160 // for better performance. 161 counter = ComputeSingleRound(counter, key); 162 RaiseKey(&key); 163 counter = ComputeSingleRound(counter, key); 164 RaiseKey(&key); 165 counter = ComputeSingleRound(counter, key); 166 RaiseKey(&key); 167 counter = ComputeSingleRound(counter, key); 168 RaiseKey(&key); 169 counter = ComputeSingleRound(counter, key); 170 RaiseKey(&key); 171 counter = ComputeSingleRound(counter, key); 172 RaiseKey(&key); 173 counter = ComputeSingleRound(counter, key); 174 RaiseKey(&key); 175 counter = ComputeSingleRound(counter, key); 176 RaiseKey(&key); 177 counter = ComputeSingleRound(counter, key); 178 RaiseKey(&key); 179 counter = ComputeSingleRound(counter, key); 180 181 SkipOne(); 182 183 return counter; 184 } 185 186 private: 187 // We use the same constants as recommended by the original paper. 188 static const uint32 kPhiloxW32A = 0x9E3779B9; 189 static const uint32 kPhiloxW32B = 0xBB67AE85; 190 static const uint32 kPhiloxM4x32A = 0xD2511F53; 191 static const uint32 kPhiloxM4x32B = 0xCD9E8D57; 192 193 // Helper function to skip the next sample of 128-bits in the current stream. 194 PHILOX_DEVICE_INLINE void SkipOne() { 195 if (++counter_[0] == 0) { 196 if (++counter_[1] == 0) { 197 if (++counter_[2] == 0) { 198 ++counter_[3]; 199 } 200 } 201 } 202 } 203 204 // Helper function to return the lower and higher 32-bits from two 32-bit 205 // integer multiplications. 206 PHILOX_DEVICE_INLINE 207 static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low, 208 uint32* result_high) { 209#ifndef __CUDA_ARCH__ 210 const uint64 product = static_cast<uint64>(a) * b; 211 *result_low = static_cast<uint32>(product); 212 *result_high = static_cast<uint32>(product >> 32); 213#else 214 *result_low = a * b; 215 *result_high = __umulhi(a, b); 216#endif 217 } 218 219 // Helper function for a single round of the underlying Philox algorithm. 220 PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound( 221 const ResultType& counter, const Key& key) { 222 uint32 lo0; 223 uint32 hi0; 224 MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); 225 226 uint32 lo1; 227 uint32 hi1; 228 MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); 229 230 ResultType result; 231 result[0] = hi1 ^ counter[1] ^ key[0]; 232 result[1] = lo1; 233 result[2] = hi0 ^ counter[3] ^ key[1]; 234 result[3] = lo0; 235 return result; 236 } 237 238 PHILOX_DEVICE_INLINE void RaiseKey(Key* key) { 239 (*key)[0] += kPhiloxW32A; 240 (*key)[1] += kPhiloxW32B; 241 } 242 243 private: 244 ResultType counter_; 245 Key key_; 246}; 247 248} // namespace random 249} // namespace tensorflow 250 251#endif // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 252