philox_random.h revision 895a0a423bca9118faa66fb144d434e5c06751cc
19c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur/* Copyright 2015 Google Inc. All Rights Reserved. 29c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur 39c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurLicensed under the Apache License, Version 2.0 (the "License"); 49c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudluryou may not use this file except in compliance with the License. 59c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurYou may obtain a copy of the License at 69c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur 79c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur http://www.apache.org/licenses/LICENSE-2.0 89c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur 99c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurUnless required by applicable law or agreed to in writing, software 109c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurdistributed under the License is distributed on an "AS IS" BASIS, 119c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 129c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurSee the License for the specific language governing permissions and 139c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurlimitations under the License. 149c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur==============================================================================*/ 159c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur 16f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Implement the Philox algorithm to generate random numbers in parallel. 17f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. 18f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// http://www.thesalmons.org/john/random123/papers/random123sc11.pdf 19f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 20f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 21f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 22f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 23f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <stdlib.h> 24f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 255a24d3a2514698b0ae11563b2ea21e368de48a4fJosh Levenberg#include "tensorflow/core/platform/types.h" 26f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 27f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Function qualifiers that need to work on both CPU and GPU. 28e1ac9aea1ca0a5aac83fa5b0f1e0929e412e44bdA. Unique TensorFlower#if defined(__CUDACC__) 29f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For nvcc. 30f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC __host__ __device__ 31f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE __inline__ 32f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else 33f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For non-nvcc. 34f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC 35f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE inline 36f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif 37f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE 38f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 39f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <math.h> 40f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 41f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace tensorflow { 42f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace random { 43f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 44f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that represents an inline array. It can be used on both CPU and GPU, 45f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// and also trivially copyable between CPU and GPU. 46f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Arguments: 47f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// T: the array element type; 48f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// ElementCount: the fixed size of the array; 49f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurtemplate <typename T, int ElementCount> 50f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass Array { 51f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public: 52f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE Array() { 53f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur for (int i = 0; i < ElementCount; ++i) { 54f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur data_[i] = T(); 55f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 56f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 57f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 58f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE const T& operator[](int index) const { 59f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur return data_[index]; 60f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 61f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 62f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; } 63f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 64f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur size_t size() const { return ElementCount; } 65f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 66f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private: 67f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur T data_[ElementCount]; 68f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}; 69f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 70f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that encapsulates all the states for a random number generator using 71f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits 72f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// in the form of four uint32. 73f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// There are multiple variants of this algorithm, we picked the 4x32_10 version 74f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// that is most suited for our applications. 75f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Since this class is meant to be copied between CPU to GPU, it maintains a 76f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// value semantics. 77f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 78f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For example: To use this class and populate an array of 1024 randoms on CPU 79f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// with two threads, 80f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 81f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { 82f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// assert(start % 4 == 0); 83f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// assert(limit % 4 == 0); 84f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// rnd.Skip(start / 4); 85f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// for (int i = start; i < limit; i += 4) { 86f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// auto sample = rnd(); 87f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// ... copy sample[0..3] to output[i..i+3] 88f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// } 89f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// } 90f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 91f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// PhiloxRandom rng(seed); 92f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// PhiloxRandom rng_copy = rng; 93f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// rng.Skip(1000/4); 94f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 95f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; 96f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; 97f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// ... wait for thread 1 & 2 to finish executing Fill(). 98f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 99f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// NOTE: 100f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 1. PhiloxRandom is trivially copyable. 101f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 2. PhiloxRandom is compilable by gcc and nvcc. 102f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass PhiloxRandom { 103f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public: 104f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur typedef Array<uint32, 4> ResultType; 105f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur typedef uint32 ResultElementType; 106f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // The number of elements that will be returned. 107f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static const int kResultElementCount = 4; 1086855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower // Cost of generation of a single element (in cycles). 1096855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower static const int kElementCost = 10; 110f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 111f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE 112f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PhiloxRandom() {} 113f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 114f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE 115f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur explicit PhiloxRandom(uint64 seed) { 116f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur key_[0] = static_cast<uint32>(seed); 117f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur key_[1] = static_cast<uint32>(seed >> 32); 118f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 119f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 120f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE 121f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) { 122f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur key_[0] = static_cast<uint32>(seed_lo); 123f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur key_[1] = static_cast<uint32>(seed_lo >> 32); 124f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter_[2] = static_cast<uint32>(seed_hi); 125f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter_[3] = static_cast<uint32>(seed_hi >> 32); 126f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 127f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 128f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Skip the specified number of samples of 128-bits in the current stream. 129f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE 130f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur void Skip(uint64 count) { 131f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur const uint32 count_lo = static_cast<uint32>(count); 132f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32 count_hi = static_cast<uint32>(count >> 32); 133f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 134f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter_[0] += count_lo; 135f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (counter_[0] < count_lo) { 136f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ++count_hi; 137f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 138f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 139f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter_[1] += count_hi; 140f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (counter_[1] < count_hi) { 141f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (++counter_[2] == 0) { 142f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ++counter_[3]; 143f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 144f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 145f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 146f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 147f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Returns a group of four random numbers using the underlying Philox 148f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // algorithm. 149f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE ResultType operator()() { 150f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ResultType counter = counter_; 151f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur Key key = key_; 152f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 153f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Run the single rounds for ten times. Manually unrolling the loop 154f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // for better performance. 155f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 156f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 157f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 158f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 159f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 160f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 161f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 162f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 163f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 164f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 165f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 166f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 167f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 168f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 169f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 170f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 171f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 172f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur RaiseKey(&key); 173f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur counter = ComputeSingleRound(counter, key); 174f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 175f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur SkipOne(); 176f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 177f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur return counter; 178f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 179f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 180f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private: 181f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // The type for the 64-bit key stored in the form of two 32-bit uint 182f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // that are used in the diffusion process. 183f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur typedef Array<uint32, 2> Key; 184f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 185f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // We use the same constants as recommended by the original paper. 186f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static const uint32 kPhiloxW32A = 0x9E3779B9; 187f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static const uint32 kPhiloxW32B = 0xBB67AE85; 188f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static const uint32 kPhiloxM4x32A = 0xD2511F53; 189f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static const uint32 kPhiloxM4x32B = 0xCD9E8D57; 190f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 191f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Helper function to skip the next sample of 128-bits in the current stream. 192f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE void SkipOne() { 193f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (++counter_[0] == 0) { 194f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (++counter_[1] == 0) { 195f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur if (++counter_[2] == 0) { 196f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ++counter_[3]; 197f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 198f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 199f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 200f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 201f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 202f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Helper function to return the lower and higher 32-bits from two 32-bit 203f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // integer multiplications. 204f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE 205f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low, 206f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32* result_high) { 207895a0a423bca9118faa66fb144d434e5c06751ccA. Unique TensorFlower#ifndef __CUDA_ARCH__ 208f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur const uint64 product = static_cast<uint64>(a) * b; 209f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur *result_low = static_cast<uint32>(product); 210f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur *result_high = static_cast<uint32>(product >> 32); 211f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else 212f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur *result_low = a * b; 213f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur *result_high = __umulhi(a, b); 214f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif 215f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 216f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 217f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur // Helper function for a single round of the underlying Philox algorithm. 218f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound( 219f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur const ResultType& counter, const Key& key) { 220f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32 lo0; 221f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32 hi0; 222f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); 223f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 224f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32 lo1; 225f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur uint32 hi1; 226f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); 227f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 228f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ResultType result; 229f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur result[0] = hi1 ^ counter[1] ^ key[0]; 230f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur result[1] = lo1; 231f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur result[2] = hi0 ^ counter[3] ^ key[1]; 232f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur result[3] = lo0; 233f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur return result; 234f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 235f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 236f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur PHILOX_DEVICE_INLINE void RaiseKey(Key* key) { 237f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur (*key)[0] += kPhiloxW32A; 238f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur (*key)[1] += kPhiloxW32B; 239f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur } 240f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 241f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private: 242f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur ResultType counter_; 243f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur Key key_; 244f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}; 245f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 246f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur} // namespace random 247f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur} // namespace tensorflow 248f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur 249f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_ 250