1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
29c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
39c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurLicensed under the Apache License, Version 2.0 (the "License");
49c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudluryou may not use this file except in compliance with the License.
59c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurYou may obtain a copy of the License at
69c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
79c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur    http://www.apache.org/licenses/LICENSE-2.0
89c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
99c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurUnless required by applicable law or agreed to in writing, software
109c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurdistributed under the License is distributed on an "AS IS" BASIS,
119c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
129c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath KudlurSee the License for the specific language governing permissions and
139c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlurlimitations under the License.
149c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur==============================================================================*/
159c3043ff3bf31a6a81810b4ce9e87ef936f1f529Manjunath Kudlur
16f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Implement the Philox algorithm to generate random numbers in parallel.
17f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
18f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
19f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
20f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#ifndef TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
21f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
22f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
23f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <stdlib.h>
24f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
255a24d3a2514698b0ae11563b2ea21e368de48a4fJosh Levenberg#include "tensorflow/core/platform/types.h"
26f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
27f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Function qualifiers that need to work on both CPU and GPU.
28e1ac9aea1ca0a5aac83fa5b0f1e0929e412e44bdA. Unique TensorFlower#if defined(__CUDACC__)
29f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For nvcc.
30f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC __host__ __device__
31f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE __inline__
32f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else
33f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For non-nvcc.
34f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_FUNC
35f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_INLINE inline
36f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif
37f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
38f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
39f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#include <math.h>
40f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
41f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace tensorflow {
42f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurnamespace random {
43f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
44f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that represents an inline array. It can be used on both CPU and GPU,
45f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// and also trivially copyable between CPU and GPU.
46f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Arguments:
47f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   T: the array element type;
48f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//   ElementCount: the fixed size of the array;
49f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurtemplate <typename T, int ElementCount>
50f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass Array {
51f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public:
52f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE Array() {
53f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    for (int i = 0; i < ElementCount; ++i) {
54529e29712e681aefbf08539b6fae50fafdae8cc3Benoit Steiner      data_[i] = T(0);
55f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
56f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
57f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
58f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE const T& operator[](int index) const {
59f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return data_[index];
60f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
61f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
62f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE T& operator[](int index) { return data_[index]; }
63f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
64f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  size_t size() const { return ElementCount; }
65f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
66f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
67f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  T data_[ElementCount];
68f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur};
69f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
70f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// A class that encapsulates all the states for a random number generator using
71f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits
72f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// in the form of four uint32.
73f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// There are multiple variants of this algorithm, we picked the 4x32_10 version
74f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// that is most suited for our applications.
75f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// Since this class is meant to be copied between CPU to GPU, it maintains a
76f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// value semantics.
77f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
78f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// For example: To use this class and populate an array of 1024 randoms on CPU
79f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// with two threads,
80f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
81f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
82f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    assert(start % 4 == 0);
83f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    assert(limit % 4 == 0);
84f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    rnd.Skip(start / 4);
85f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    for (int i = start; i < limit; i += 4) {
86f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//      auto sample = rnd();
87f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//      ... copy sample[0..3] to output[i..i+3]
88f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//    }
89f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  }
90f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
91f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  PhiloxRandom rng(seed);
92f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  PhiloxRandom rng_copy = rng;
93f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  rng.Skip(1000/4);
94f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
95f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
96f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
97f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//  ... wait for thread 1 & 2 to finish executing Fill().
98f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur//
99f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// NOTE:
100f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 1. PhiloxRandom is trivially copyable.
101f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur// 2. PhiloxRandom is compilable by gcc and nvcc.
102f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlurclass PhiloxRandom {
103f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur public:
10452dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  using ResultType = Array<uint32, 4>;
10552dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  using ResultElementType = uint32;
106f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // The number of elements that will be returned.
107f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const int kResultElementCount = 4;
1086855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower  // Cost of generation of a single element (in cycles).
1096855bcc08cfcbba1a20699b3d53458a490cde2a8A. Unique TensorFlower  static const int kElementCost = 10;
11052dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  // The type for the 64-bit key stored in the form of two 32-bit uint
11152dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  // that are used in the diffusion process.
11252dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  using Key = Array<uint32, 2>;
113f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
114f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
115f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PhiloxRandom() {}
116f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
117f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
118f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  explicit PhiloxRandom(uint64 seed) {
119f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[0] = static_cast<uint32>(seed);
120f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[1] = static_cast<uint32>(seed >> 32);
121f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
122f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
123f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
124f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  explicit PhiloxRandom(uint64 seed_lo, uint64 seed_hi) {
125f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[0] = static_cast<uint32>(seed_lo);
126f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    key_[1] = static_cast<uint32>(seed_lo >> 32);
127f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[2] = static_cast<uint32>(seed_hi);
128f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[3] = static_cast<uint32>(seed_hi >> 32);
129f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
130f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
13152dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  PHILOX_DEVICE_INLINE
13252dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain  PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
13352dcb2590bb9274262656c958c105cb5e5cc1300Rohan Jain
134f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Skip the specified number of samples of 128-bits in the current stream.
135f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
136f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  void Skip(uint64 count) {
137f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    const uint32 count_lo = static_cast<uint32>(count);
138f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 count_hi = static_cast<uint32>(count >> 32);
139f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
140f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[0] += count_lo;
141f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (counter_[0] < count_lo) {
142f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      ++count_hi;
143f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
144f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
145f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter_[1] += count_hi;
146f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (counter_[1] < count_hi) {
147f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      if (++counter_[2] == 0) {
148f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        ++counter_[3];
149f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      }
150f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
151f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
152f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
153f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Returns a group of four random numbers using the underlying Philox
154f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // algorithm.
155f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE ResultType operator()() {
156f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    ResultType counter = counter_;
157f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    Key key = key_;
158f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
159f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    // Run the single rounds for ten times. Manually unrolling the loop
160f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    // for better performance.
161f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
162f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
163f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
164f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
165f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
166f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
167f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
168f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
169f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
170f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
171f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
172f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
173f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
174f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
175f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
176f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
177f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
178f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    RaiseKey(&key);
179f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    counter = ComputeSingleRound(counter, key);
180f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
181f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    SkipOne();
182f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
183f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return counter;
184f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
185f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
186f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
187f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // We use the same constants as recommended by the original paper.
188f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxW32A = 0x9E3779B9;
189f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxW32B = 0xBB67AE85;
190f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxM4x32A = 0xD2511F53;
191f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static const uint32 kPhiloxM4x32B = 0xCD9E8D57;
192f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
193f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function to skip the next sample of 128-bits in the current stream.
194f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE void SkipOne() {
195f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    if (++counter_[0] == 0) {
196f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      if (++counter_[1] == 0) {
197f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        if (++counter_[2] == 0) {
198f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur          ++counter_[3];
199f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur        }
200f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      }
201f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    }
202f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
203f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
204f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function to return the lower and higher 32-bits from two 32-bit
205f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // integer multiplications.
206f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE
207f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  static void MultiplyHighLow(uint32 a, uint32 b, uint32* result_low,
208f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur                              uint32* result_high) {
209895a0a423bca9118faa66fb144d434e5c06751ccA. Unique TensorFlower#ifndef __CUDA_ARCH__
210f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    const uint64 product = static_cast<uint64>(a) * b;
211f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_low = static_cast<uint32>(product);
212f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_high = static_cast<uint32>(product >> 32);
213f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#else
214f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_low = a * b;
215f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    *result_high = __umulhi(a, b);
216f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif
217f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
218f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
219f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  // Helper function for a single round of the underlying Philox algorithm.
220f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(
221f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur      const ResultType& counter, const Key& key) {
222f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 lo0;
223f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 hi0;
224f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
225f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
226f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 lo1;
227f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    uint32 hi1;
228f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
229f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
230f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    ResultType result;
231f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[0] = hi1 ^ counter[1] ^ key[0];
232f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[1] = lo1;
233f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[2] = hi0 ^ counter[3] ^ key[1];
234f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    result[3] = lo0;
235f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    return result;
236f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
237f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
238f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  PHILOX_DEVICE_INLINE void RaiseKey(Key* key) {
239f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    (*key)[0] += kPhiloxW32A;
240f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur    (*key)[1] += kPhiloxW32B;
241f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  }
242f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
243f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur private:
244f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  ResultType counter_;
245f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur  Key key_;
246f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur};
247f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
248f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}  // namespace random
249f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur}  // namespace tensorflow
250f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur
251f41959ccb2d9d4c722fe8fc3351401d53bcf490Manjunath Kudlur#endif  // TENSORFLOW_LIB_RANDOM_PHILOX_RANDOM_H_
252