/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 */

#if GOOGLE_CUDA

#include <algorithm>
#include <complex>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

namespace detail {

// Helper for range-based for loop using 'delta' increments.
// Usage: see the CudaGridRangeX/Y/Z() functions below.
template <typename T>
class CudaGridRange {
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __device__ CudaGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                  gridDim.x * blockDim.x, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
                                  gridDim.y * blockDim.y, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
                                  gridDim.z * blockDim.z, count);
}
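
// Illustrative sketch (not part of the TensorFlow API): a minimal elementwise
// kernel built on CudaGridRangeX. The kernel name and parameters below are
// hypothetical; they only show how the grid-stride range is typically
// consumed.
//
//   template <typename T>
//   __global__ void ScaleKernel(int count, const T* in, T scale, T* out) {
//     for (int i : CudaGridRangeX(count)) {
//       out[i] = in[i] * scale;  // each index in [0, count) visited once
//     }
//   }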

// Mask for all 32 threads in a warp.
const unsigned kCudaWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread
__device__ inline unsigned CudaLaneId() {
  unsigned int lane_id;
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
  return lane_id;
}

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
                                                   unsigned src_lane) {
  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
  unsigned src_lane_mask = __shfl(mask, src_lane);
#endif
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}

// Returns the actual source lane for shuffle.
__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
  int lane_id = CudaLaneId();
  int lane_base = lane_id & ~width + 1;
  int lane_offset = src_lane & width - 1;
  return lane_base + lane_offset;
}

// Returns the source lane for shuffle up.
__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane_id = CudaLaneId();
  if ((lane_id & width - 1) < delta) {
    return lane_id;
  }
  return lane_id - delta;
}

// Returns the source lane for shuffle down.
__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
                                                     int width) {
  unsigned lane_id = CudaLaneId();
  if ((lane_id & width - 1) + delta >= width) {
    return lane_id;
  }
  return lane_id + delta;
}

// Returns the source lane for shuffle xor.
__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane_id = CudaLaneId();
  int src_lane = lane_id ^ lane_mask;
  if (src_lane > (lane_id | width - 1)) {
    return lane_id;
  }
  return src_lane;
}
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAnySync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAllSync(unsigned mask, int pred) {
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
                             int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleSync(unsigned mask, double value,
                                         int src_lane, int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleSync(mask, hi, src_lane, width);
  lo = CudaShuffleSync(mask, lo, src_lane, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
                                      int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
                                           unsigned delta,
                                           int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleUpSync(mask, hi, delta, width);
  lo = CudaShuffleUpSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
                                        int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
                                             unsigned delta,
                                             int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleDownSync(mask, hi, delta, width);
  lo = CudaShuffleDownSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
                                int width = warpSize) {
  assert(!(width & width - 1));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
                                            int lane_mask,
                                            int width = warpSize) {
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}
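
// Illustrative sketch (not part of the TensorFlow API): a full-warp sum
// reduction built on CudaShuffleXorSync with the all-lanes mask. It assumes
// all 32 lanes of the warp call it in convergence, as required by the rules
// above; the helper name is hypothetical.
//
//   __device__ float CudaWarpSum(float value) {
//     for (int lane_mask = warpSize / 2; lane_mask > 0; lane_mask /= 2) {
//       value += CudaShuffleXorSync(kCudaWarpAll, value, lane_mask);
//     }
//     return value;  // every lane now holds the sum over the warp
//   }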

// Wrapper for __ldg.
template <typename T>
__host__ __device__ T CudaLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}

__host__ __device__ inline bool CudaLdg(const bool* address) {
  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
}

__host__ __device__ inline std::complex<float> CudaLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

__host__ __device__ inline std::complex<double> CudaLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
template <typename T>
__global__ void SetToValue(const int count, T* ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = value;
  }
}
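
// Illustrative sketch (not part of the TensorFlow API): a host-side launch of
// SetToValue. The block size of 256 is an arbitrary choice for the example;
// real kernels usually derive the launch configuration from helpers outside
// this header.
//
//   void FillWithOnes(float* device_ptr, int count, cudaStream_t stream) {
//     const int threads_per_block = 256;
//     const int blocks = (count + threads_per_block - 1) / threads_per_block;
//     SetToValue<<<blocks, threads_per_block, 0, stream>>>(count, device_ptr,
//                                                          1.0f);
//   }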

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
template <typename T, typename F>
__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}

// Overload for floating point (using integer comparison to handle NaN
// correctly).
template <typename F>
__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
  return __int_as_float(
      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
template <typename F>
__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
  return __longlong_as_double(CudaAtomicCasHelper(
      reinterpret_cast<tensorflow::uint64*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
}

// Overload of above function for half. Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow under high concurrency, since most
// threads will be spinning on failing their compare-and-swap tests. (The fact
// that we get false sharing on the neighboring fp16 makes this even worse.)
// If you are doing a large reduction, you are much better off doing the
// intermediate steps in fp32 and then switching to fp16 as late as you can in
// the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}

template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

}  // namespace detail

// CUDA provides atomic ops, but not for all types.  We provide wrappers
// for some ops and provide implementation for all reasonable types.

template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
  return atomicAdd(ptr, value);
}

__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}
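
// Illustrative sketch (not part of the TensorFlow API), following the advice
// in the fp16 CAS helper comment above: accumulate per-thread partial results
// in fp32 and only touch the fp16 output with a single atomic per thread. The
// kernel name and parameters are hypothetical.
//
//   __global__ void SumToHalf(int count, const float* in, Eigen::half* out) {
//     float partial = 0.f;
//     for (int i : CudaGridRangeX(count)) {
//       partial += in[i];
//     }
//     // One fp16 CAS loop per thread instead of one per element.
//     CudaAtomicAdd(out, Eigen::half(partial));
//   }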

#if __CUDA_ARCH__ < 600
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(ptr,
                                     [value](double a) { return a + value; });
}
#elif __clang__
// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
// see https://reviews.llvm.org/D39638
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  double result;
  asm volatile("atom.add.f64 %0, [%1], %2;"
               : "=d"(result)
               : "l"(ptr), "d"(value)
               : "memory");
  return result;
}
#endif

// CudaAtomicAdd
// Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
// not support. We treat a std::complex<T>* as a T* (the C++ standard section
// 26.4.4 allows this explicitly) and atomic add the real and imaginary
// components individually. The operation as a whole is not atomic, but we can
// safely treat the components independently for the purpose of accumulating.
__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
                                                    std::complex<float> value) {
  auto ptr_scalar = reinterpret_cast<float*>(ptr);
  return std::complex<float>(CudaAtomicAdd(ptr_scalar, value.real()),
                             CudaAtomicAdd(ptr_scalar + 1, value.imag()));
}

__device__ inline std::complex<double> CudaAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  auto ptr_scalar = reinterpret_cast<double*>(ptr);
  return std::complex<double>(CudaAtomicAdd(ptr_scalar, value.real()),
                              CudaAtomicAdd(ptr_scalar + 1, value.imag()));
}
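
// Illustrative sketch (not part of the TensorFlow API): a scatter-add kernel
// that needs CudaAtomicAdd because several threads may update the same output
// slot concurrently. The kernel name and parameters are hypothetical.
//
//   __global__ void ScatterAddKernel(int count, const int* indices,
//                                    const float* updates, float* out) {
//     for (int i : CudaGridRangeX(count)) {
//       CudaAtomicAdd(out + indices[i], updates[i]);
//     }
//   }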

// CudaAtomicSub
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline double CudaAtomicSub(double* ptr, double value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}

// CudaAtomicMax
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
  return atomicMax(ptr, value);
}

__device__ inline float CudaAtomicMax(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return max(a, value); });
}

__device__ inline double CudaAtomicMax(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
}
#endif

// CudaAtomicMin
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
  return atomicMin(ptr, value);
}

__device__ inline float CudaAtomicMin(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return min(a, value); });
}

__device__ inline double CudaAtomicMin(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}

#if __CUDA_ARCH__ < 320
__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
}
#endif

// CudaAtomicMul
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
}

// CudaAtomicDiv
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA
#endif  // TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_