philox_random.h revision 895a0a423bca9118faa66fb144d434e5c06751cc
19c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur/* Copyright 2015 Google Inc. All Rights Reserved.
29c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
39c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurLicensed under the Apache License, Version 2.0 (the "License");
49c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudluryou may not use this file except in compliance with the License.
59c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurYou may obtain a copy of the License at
69c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
79c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur    http://www.apache.org/licenses/LICENSE-2.0
89c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
99c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurUnless required by applicable law or agreed to in writing, software
109c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurdistributed under the License is distributed on an "AS IS" BASIS,
119c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
129c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurSee the License for the specific language governing permissions and
139c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurlimitations under the License.
149c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur==============================================================================*/
159c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
16f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Implement the Philox algorithm to generate random numbers in parallel.
17f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
18f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
19f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
20f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
21f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
22f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
23f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <stdlib.h>
24f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
255a24d3a2514698b0ae11563b2ea21e368de48a4fJosh Levenberg#include "tensorflow/core/platform/types.h"
26f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
27f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Function qualifiers that need to work on both CPU and GPU.
28e1ac9aea1ca0a5aac83fa5b0f1e0929e412e44bdA. Unique TensorFlower#if defined(__CUDACC__)
29f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For nvcc.
30f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC __host__ __device__
31f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE __inline__
32f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else
33f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For non-nvcc.
34f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC
35f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE inline
36f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif
37f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
38f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
39f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <math.h>
40f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
41f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace tensorflow {
42f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace random {
43f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
44f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that represents an inline array. It can be used on both CPU and GPU,
45f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// and also trivially copyable between CPU and GPU.
46f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Arguments:
47f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   T: the array element type;
48f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   ElementCount: the fixed size of the array;
49f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurtemplate <typename T, int ElementCount>
50f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass Array {
51f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public:
52f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE Array() {
53f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    for (int i = 0; i < ElementCount; ++i) {
54f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      data_[i] = T();
55f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
56f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
57f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
58f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE const T& operator[](int index) const {
59f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return data_[index];
60f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
61f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
62f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; }
63f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
64f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  size_t size() const { return ElementCount; }
65f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
66f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
67f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  T data_[ElementCount];
68f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur};
69f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
70f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that encapsulates all the states for a random number generator using
71f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
72f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// in the form of four uint32.
73f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// There are multiple variants of this algorithm, we picked the 4x32_10 version
74f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// that is most suited for our applications.
75f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Since this class is meant to be copied between CPU to GPU, it maintains a
76f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// value semantics.
77f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
78f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For example: To use this class and populate an array of 1024 randoms on CPU
79f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// with two threads,
80f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
81f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
82f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    assert(start % 4 == 0);
83f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    assert(limit % 4 == 0);
84f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    rnd.Skip(start / 4);
85f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    for (int i = start; i < limit; i += 4) {
86f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//      auto sample = rnd();
87f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//      ... copy sample[0..3] to output[i..i+3]
88f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    }
89f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  }
90f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
91f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  PhiloxRandom rng(seed);
92f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  PhiloxRandom rng_copy = rng;
93f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  rng.Skip(1000/4);
94f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
95f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
96f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
97f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... wait for thread 1 & 2 to finish executing Fill().
98f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
99f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// NOTE:
100f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 1. PhiloxRandom is trivially copyable.
101f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 2. PhiloxRandom is compilable by gcc and nvcc.
102f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass PhiloxRandom {
103f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public:
104f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  typedef Array<uint32, 4> ResultType;
105f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  typedef uint32 ResultElementType;
106f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // The number of elements that will be returned.
107f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const int kResultElementCount = 4;
1086855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower  // Cost of generation of a single element (in cycles).
1096855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower  static const int kElementCost = 10;
110f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
111f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
112f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PhiloxRandom() {}
113f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
114f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
115f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  explicit PhiloxRandom(uint64 seed) {
116f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[0] = static_cast<uint32>(seed);
117f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[1] = static_cast<uint32>(seed >> 32);
118f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
119f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
120f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
121f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) {
122f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[0] = static_cast<uint32>(seed_lo);
123f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[1] = static_cast<uint32>(seed_lo >> 32);
124f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[2] = static_cast<uint32>(seed_hi);
125f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[3] = static_cast<uint32>(seed_hi >> 32);
126f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
127f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
128f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Skip the specified number of samples of 128-bits in the current stream.
129f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
130f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  void Skip(uint64 count) {
131f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    const uint32 count_lo = static_cast<uint32>(count);
132f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 count_hi = static_cast<uint32>(count >> 32);
133f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
134f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[0] += count_lo;
135f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (counter_[0] < count_lo) {
136f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      ++count_hi;
137f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
138f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
139f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[1] += count_hi;
140f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (counter_[1] < count_hi) {
141f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      if (++counter_[2] == 0) {
142f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        ++counter_[3];
143f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      }
144f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
145f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
146f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
147f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Returns a group of four random numbers using the underlying Philox
148f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // algorithm.
149f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE ResultType operator()() {
150f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    ResultType counter = counter_;
151f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    Key key = key_;
152f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
153f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    // Run the single rounds for ten times. Manually unrolling the loop
154f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    // for better performance.
155f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
156f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
157f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
158f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
159f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
160f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
161f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
162f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
163f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
164f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
165f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
166f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
167f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
168f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
169f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
170f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
171f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
172f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
173f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
174f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
175f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    SkipOne();
176f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
177f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return counter;
178f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
179f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
180f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
181f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // The type for the 64-bit key stored in the form of two 32-bit uint
182f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // that are used in the diffusion process.
183f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  typedef Array<uint32, 2> Key;
184f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
185f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // We use the same constants as recommended by the original paper.
186f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxW32A = 0x9E3779B9;
187f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxW32B = 0xBB67AE85;
188f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxM4x32A = 0xD2511F53;
189f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxM4x32B = 0xCD9E8D57;
190f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
191f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function to skip the next sample of 128-bits in the current stream.
192f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE void SkipOne() {
193f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (++counter_[0] == 0) {
194f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      if (++counter_[1] == 0) {
195f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        if (++counter_[2] == 0) {
196f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur          ++counter_[3];
197f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        }
198f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      }
199f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
200f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
201f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
202f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function to return the lower and higher 32-bits from two 32-bit
203f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // integer multiplications.
204f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
205f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low,
206f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur                              uint32* result_high) {
207895a0a423bca9118faa66fb144d434e5c06751ccA. Unique TensorFlower#ifndef __CUDA_ARCH__
208f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    const uint64 product = static_cast<uint64>(a) * b;
209f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_low = static_cast<uint32>(product);
210f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_high = static_cast<uint32>(product >> 32);
211f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else
212f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_low = a * b;
213f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_high = __umulhi(a, b);
214f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif
215f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
216f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
217f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function for a single round of the underlying Philox algorithm.
218f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(
219f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      const ResultType& counter, const Key& key) {
220f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 lo0;
221f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 hi0;
222f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
223f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
224f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 lo1;
225f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 hi1;
226f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
227f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
228f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    ResultType result;
229f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[0] = hi1 ^ counter[1] ^ key[0];
230f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[1] = lo1;
231f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[2] = hi0 ^ counter[3] ^ key[1];
232f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[3] = lo0;
233f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return result;
234f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
235f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
236f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE void RaiseKey(Key* key) {
237f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    (*key)[0] += kPhiloxW32A;
238f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    (*key)[1] += kPhiloxW32B;
239f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
240f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
241f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
242f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  ResultType counter_;
243f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  Key key_;
244f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur};
245f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
246f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}  // namespace random
247f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}  // namespace tensorflow
248f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
249f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif  // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
250