/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_
#define TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_

/**
 * Wrappers and helpers for CUDA device code.
 *
 * Wraps the warp-cooperative intrinsics introduced in CUDA 9 to provide
 * backwards compatibility, see go/volta-porting for details.
 * Provides atomic operations on types that aren't natively supported.
 */

#if GOOGLE_CUDA

#include <algorithm>
#include <complex>
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

namespace detail {

// Helper for range-based for loop using 'delta' increments.
// Usage: see CudaGridRange?() functions below.
template <typename T>
class CudaGridRange {
  // Forward iterator that steps 'delta_' per increment. A delta_ of zero
  // marks the end sentinel.
  struct Iterator {
    __device__ Iterator(T index, T delta) : index_(index), delta_(delta) {}
    __device__ T operator*() const { return index_; }
    __device__ Iterator& operator++() {
      index_ += delta_;
      return *this;
    }
    __device__ bool operator!=(const Iterator& other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) compares equal, so an
      // iterator that has overshot the end still terminates the loop.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  // Iterates over [begin, end) in steps of 'delta'.
  __device__ CudaGridRange(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __device__ Iterator begin() const { return Iterator{begin_, delta_}; }
  __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

}  // namespace detail

// Helper to visit indices in the range 0 <= i < count, using the x-coordinate
// of the global thread index. That is, each index i is visited by all threads
// with the same x-coordinate.
// Usage: for(int i : CudaGridRangeX(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeX(T count) {
  return detail::CudaGridRange<T>(blockIdx.x * blockDim.x + threadIdx.x,
                                  gridDim.x * blockDim.x, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : CudaGridRangeY(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeY(T count) {
  return detail::CudaGridRange<T>(blockIdx.y * blockDim.y + threadIdx.y,
                                  gridDim.y * blockDim.y, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : CudaGridRangeZ(count)) { visit(i); }
template <typename T>
__device__ detail::CudaGridRange<T> CudaGridRangeZ(T count) {
  return detail::CudaGridRange<T>(blockIdx.z * blockDim.z + threadIdx.z,
                                  gridDim.z * blockDim.z, count);
}

// Mask for all 32 threads in a warp.
const unsigned kCudaWarpAll = 0xffffffff;

// Returns the warp lane ID of the calling thread
__device__ inline unsigned CudaLaneId() {
  unsigned int lane_id;
  asm("mov.u32 %0, %%laneid;" : "=r"(lane_id));
  return lane_id;
}

namespace detail {
// Returns true if mask is a valid parameter for __shfl*sync to return a well
// defined value, assuming the calling lane will read from src_lane as part of
// the shuffle operation.
//
// Specifically, returns true iff mask has the calling lane bit and the src_lane
// bit set, and the src_lane calls this function with the same mask value
// (required for the two threads to wait for each other).
//
// On Volta, for some invalid masks, this function hangs or returns false
// positives, because the implementation shuffles with the same mask that
// we are validating. Run on Pascal if you suspect that the mask is incorrect.
__device__ inline bool CudaValidateShuffleSyncMask(unsigned mask,
                                                   unsigned src_lane) {
  unsigned src_dst_mask = 1u << CudaLaneId() | 1u << src_lane;
#if CUDA_VERSION >= 9000
  unsigned src_lane_mask = __shfl_sync(mask, mask, src_lane);
#else
  unsigned src_lane_mask = __shfl(mask, src_lane);
#endif
  // Both the calling lane and src_lane must be inside 'mask', and src_lane
  // must have passed the identical mask value.
  return (src_dst_mask & ~mask) == 0 && src_lane_mask == mask;
}

// Returns the actual source lane for shuffle.
__device__ inline unsigned CudaShuffleGetSrcLane(int src_lane, int width) {
  int lane = CudaLaneId();
  // (~width + 1) == -width, so this rounds lane down to a multiple of width
  // (width is a power of two, asserted by the callers).
  int base = lane & (~width + 1);
  // src_lane modulo width.
  int offset = src_lane & (width - 1);
  return base + offset;
}

// Returns the source lane for shuffle up.
__device__ inline unsigned CudaShuffleUpGetSrcLane(unsigned delta, int width) {
  unsigned lane = CudaLaneId();
  // Lanes within 'delta' of the start of their width-sized sub-segment read
  // from themselves.
  if ((lane & (width - 1)) < delta) {
    return lane;
  }
  return lane - delta;
}

// Returns the source lane for shuffle down.
__device__ inline unsigned CudaShuffleDownGetSrcLane(unsigned delta,
                                                     int width) {
  unsigned lane = CudaLaneId();
  // Lanes that would read past the end of their width-sized sub-segment read
  // from themselves.
  if ((lane & (width - 1)) + delta >= width) {
    return lane;
  }
  return lane + delta;
}

// Returns the source lane for shuffle xor.
__device__ inline unsigned CudaShuffleXorGetSrcLane(int lane_mask, int width) {
  int lane = CudaLaneId();
  int src_lane = lane ^ lane_mask;
  // XOR targets outside the lane's width-sized sub-segment read themselves.
  if (src_lane > (lane | (width - 1))) {
    return lane;
  }
  return src_lane;
}
}  // namespace detail

// For all *_sync wrappers below, it is illegal to synchronize threads from
// different program locations, because that is not supported before sm_70.
// In other words, all threads in 'mask' must call the functions in convergence.
// Code that requires sm_70 (and CUDA 9) may use the intrinsic directly.
//
// It is also illegal to shuffle with a mask that produces an undefined result
// for any of the threads. Specifically, all source threads of the shuffle
// must have their corresponding bit in 'mask' set.

// Wrapper for __syncwarp. No-op for CUDA 8 and earlier.
__device__ inline void CudaSyncWarp(unsigned mask = kCudaWarpAll) {
  // The calling lane must be part of its own mask.
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#endif
}

// Wrapper for __ballot_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline unsigned CudaBallotSync(unsigned mask, int pred) {
  // The calling lane must be part of its own mask.
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, pred);
#else
  return __ballot(pred) & mask;  // Apply mask to match __ballot_sync's spec.
#endif
}

// Wrapper for __any_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAnySync(unsigned mask, int pred) {
  // The calling lane must be part of its own mask.
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __any_sync(mask, pred);
#else
  return __any(pred);
#endif
}

// Wrapper for __all_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
__device__ inline int CudaAllSync(unsigned mask, int pred) {
  // The calling lane must be part of its own mask.
  assert(mask & 1u << CudaLaneId());
#if CUDA_VERSION >= 9000
  return __all_sync(mask, pred);
#else
  return __all(pred);
#endif
}

// Wrapper for __shfl_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleSync(unsigned mask, T value, int src_lane,
                             int width = warpSize) {
  // width must be a power of two.
  assert(!(width & (width - 1)));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleGetSrcLane(src_lane, width)));
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, src_lane, width);
#else
  return __shfl(value, src_lane, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleSync(unsigned mask, double value,
                                         int src_lane, int width = warpSize) {
  // Split the 64-bit value into two 32-bit halves, shuffle each half, and
  // reassemble the result.
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleSync(mask, hi, src_lane, width);
  lo = CudaShuffleSync(mask, lo, src_lane, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_up_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleUpSync(unsigned mask, T value, unsigned delta,
                                      int width = warpSize) {
  // width must be a power of two.
  assert(!(width & (width - 1)));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleUpGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  return __shfl_up(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleUpSync(unsigned mask, double value,
                                           unsigned delta,
                                           int width = warpSize) {
  // Split the 64-bit value into two 32-bit halves, shuffle each half, and
  // reassemble the result.
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleUpSync(mask, hi, delta, width);
  lo = CudaShuffleUpSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_down_sync. All threads in 'mask' must call this function
// in convergence, see comment above for details.
template <typename T>
__device__ inline T CudaShuffleDownSync(unsigned mask, T value, unsigned delta,
                                        int width = warpSize) {
  // width must be a power of two.
  assert(!(width & (width - 1)));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleDownGetSrcLane(delta, width)));
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  return __shfl_down(value, delta, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
__device__ inline double CudaShuffleDownSync(unsigned mask, double value,
                                             unsigned delta,
                                             int width = warpSize) {
  // Split the 64-bit value into two 32-bit halves, shuffle each half, and
  // reassemble the result.
  unsigned lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleDownSync(mask, hi, delta, width);
  lo = CudaShuffleDownSync(mask, lo, delta, width);
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __shfl_xor_sync. All threads in 'mask' must call this function in
// convergence, see comment above for details.
template <typename T>
__device__ T CudaShuffleXorSync(unsigned mask, T value, int lane_mask,
                                int width = warpSize) {
  // width must be a power of two.
  assert(!(width & (width - 1)));
  assert(detail::CudaValidateShuffleSyncMask(
      mask, detail::CudaShuffleXorGetSrcLane(lane_mask, width)));
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, lane_mask, width);
#else
  return __shfl_xor(value, lane_mask, width);
#endif
}

// Variant of the (undocumented) version from the CUDA SDK, but using unsigned
// instead of float for lo and hi (which is incorrect with ftz, for example).
// See b/69446944.
// Splits the 64-bit double into two 32-bit halves with inline PTX (mov.b64),
// shuffles each half via the 32-bit overload above, and reassembles them.
__device__ inline double CudaShuffleXorSync(unsigned mask, double value,
                                            int lane_mask,
                                            int width = warpSize) {
  unsigned lo, hi;
  // Unpack the double's bit pattern into two 32-bit registers.
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(value));
  hi = CudaShuffleXorSync(mask, hi, lane_mask, width);
  lo = CudaShuffleXorSync(mask, lo, lane_mask, width);
  // Repack the shuffled halves into a double.
  asm volatile("mov.b64 %0, {%1,%2};" : "=d"(value) : "r"(lo), "r"(hi));
  return value;
}

// Wrapper for __ldg.
// Loads through the read-only data cache on sm_35+; plain load elsewhere
// (including host compilation, where __CUDA_ARCH__ is undefined).
template <typename T>
__host__ __device__ T CudaLdg(const T* address) {
#if __CUDA_ARCH__ >= 350
  return __ldg(address);
#else
  return *address;
#endif
}
// bool overload: __ldg has no bool instantiation, so load the underlying byte
// and compare against zero.
__host__ __device__ inline bool CudaLdg(const bool* address) {
  return CudaLdg(reinterpret_cast<const char*>(address)) != 0;
}

// std::complex<float> overload: load the two floats as a single float2
// through the read-only cache and rebuild the complex value.
__host__ __device__ inline std::complex<float> CudaLdg(
    const std::complex<float>* address) {
#if __CUDA_ARCH__ >= 350
  float2 mem = __ldg(reinterpret_cast<const float2*>(address));
  return std::complex<float>(mem.x, mem.y);
#else
  return *address;
#endif
}

// std::complex<double> overload: same as above but via double2.
__host__ __device__ inline std::complex<double> CudaLdg(
    const std::complex<double>* address) {
#if __CUDA_ARCH__ >= 350
  double2 mem = __ldg(reinterpret_cast<const double2*>(address));
  return std::complex<double>(mem.x, mem.y);
#else
  return *address;
#endif
}

// Zeroes count elements starting at ptr using all threads of a 1-D grid.
// Note: this function does not synchronize, and therefore the memory range is
// not guaranteed to be zero until the next kernel launch.
template <typename T>
__global__ void SetZero(const int count, T* ptr) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = T(0);
  }
}

// Helper to set all tensor entries to a specific value.
// Fills count elements starting at ptr with 'value' using all threads of a
// 1-D grid (grid-stride loop via CudaGridRangeX). Like SetZero, this does not
// synchronize; the range is only guaranteed filled at the next kernel launch.
template <typename T>
__global__ void SetToValue(const int count, T* ptr, T value) {
  // Check that the grid is one dimensional and index doesn't overflow.
  assert(blockDim.y == 1 && blockDim.z == 1);
  assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
  for (int i : CudaGridRangeX(count)) {
    ptr[i] = value;
  }
}

namespace detail {
// Helper function for atomic accumulation implemented as CAS.
//
// Repeatedly applies 'accumulate' to the current value at *ptr and tries to
// publish the result with atomicCAS until no other thread intervened between
// the read and the swap. Returns the value at *ptr immediately before the
// successful swap (matching the return convention of CUDA's native atomics).
template <typename T, typename F>
__device__ T CudaAtomicCasHelper(T* ptr, F accumulate) {
  T old = *ptr;
  T assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed, accumulate(assumed));
  } while (assumed != old);
  return old;
}

// Overload for floating point (using integer comparison to handle NaN
// correctly).
//
// atomicCAS only operates on integer types, so the float is reinterpreted as
// int32 for the CAS loop and converted back afterwards. Comparing the raw bit
// patterns also makes the loop terminate when the value is NaN (NaN != NaN
// would otherwise spin forever).
template <typename F>
__device__ float CudaAtomicCasHelper(float* ptr, F accumulate) {
  // Fix: reinterpret the resulting int32 bit pattern back to float with
  // __int_as_float. The previous code applied __float_as_int here, which
  // takes a float argument; the int32 result was implicitly converted to
  // float first, destroying the bit pattern and returning a garbage value.
  return __int_as_float(
      CudaAtomicCasHelper(reinterpret_cast<int32*>(ptr), [accumulate](int32 a) {
        return __float_as_int(accumulate(__int_as_float(a)));
      }));
}
// Same technique for double, reinterpreted through a 64-bit integer.
template <typename F>
__device__ double CudaAtomicCasHelper(double* ptr, F accumulate) {
  return __longlong_as_double(CudaAtomicCasHelper(
      reinterpret_cast<tensorflow::uint64*>(ptr),
      [accumulate](tensorflow::uint64 a) {
        return __double_as_longlong(accumulate(__longlong_as_double(a)));
      }));
}

// Overload of above function for half.
// Note that we don't have
// atomicCAS() for anything less than 32 bits, so we need to include the
// other 16 bits in the operation.
//
// This version is going to be very slow
// under high concurrency, since most threads will be spinning on failing
// their compare-and-swap tests. (The fact that we get false sharing on the
// neighboring fp16 makes this even worse.) If you are doing a large reduction,
// you are much better off with doing the intermediate steps in fp32 and then
// switching to fp16 as late as you can in the calculations.
//
// Note: Assumes little endian.
template <typename F>
__device__ Eigen::half CudaAtomicCasHelper(Eigen::half* ptr, F accumulate) {
#if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
  static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Not little endian");
#endif
  namespace half_impl = Eigen::half_impl;
  intptr_t intptr = reinterpret_cast<intptr_t>(ptr);
  assert(!(intptr & 0x1));  // should be 2-aligned.
  // Pick the 4-byte-aligned uint32 containing this half; CAS on the whole
  // word while rewriting only the 16 bits that hold the half.
  if (intptr & 0x2) {
    // The half is in the second part of the uint32 (upper 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr - 2);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short high = static_cast<unsigned short>(arg >> 16);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(high));
      // Splice the accumulated half back into the upper 16 bits, keeping the
      // neighbor's lower 16 bits untouched.
      return (static_cast<uint32>(acc.x) << 16) | (arg & 0xffff);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result >> 16));
  } else {
    // The half is in the first part of the uint32 (lower 16 bits).
    uint32* address = reinterpret_cast<uint32*>(intptr);
    uint32 result = CudaAtomicCasHelper(address, [accumulate](uint32 arg) {
      unsigned short low = static_cast<unsigned short>(arg & 0xffff);
      Eigen::half acc = accumulate(half_impl::raw_uint16_to_half(low));
      // Splice the accumulated half back into the lower 16 bits.
      return (arg & 0xffff0000) | static_cast<uint32>(acc.x);
    });
    return half_impl::raw_uint16_to_half(static_cast<uint16>(result & 0xffff));
  }
}

// SFINAE alias: resolves to 'To' only when 'From' is convertible to it, used
// to constrain the generic atomic wrappers below.
template <typename From, typename To>
using ToTypeIfConvertible =
    typename std::enable_if<std::is_convertible<From, To>::value, To>::type;

}  // namespace detail

// CUDA provides atomic ops, but not for all types. We provide wrappers
// for some ops and provide implementation for all reasonable types.

template <typename T, typename U>
// Generic wrapper: defers to the native atomicAdd for natively supported
// type combinations.
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicAdd(T* ptr, U value) {
  return atomicAdd(ptr, value);
}

// Eigen::half has no native atomicAdd; emulate with the CAS helper above.
__device__ inline Eigen::half CudaAtomicAdd(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a + value; });
}


#if __CUDA_ARCH__ < 600
// Native double atomicAdd requires sm_60+; emulate via CAS on older arches.
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(ptr,
                                     [value](double a) { return a + value; });
}
#elif __clang__
// Clang cannot compile __nvvm_atom_add_gen_d builtin yet, use inline PTX.
// see https://reviews.llvm.org/D39638
__device__ inline double CudaAtomicAdd(double* ptr, double value) {
  double result;
  asm volatile("atom.add.f64 %0, [%1], %2;"
               : "=d"(result)
               : "l"(ptr), "d"(value)
               : "memory");
  return result;
}
#endif
// CudaAtomicAdd
// Specializations of CudaAtomicAdd for complex types, which CudaAtomicAdd does
// not support. We treat a std::complex<T>* as a T* (the C++ standard section
// 26.4.4 allows this explicitly) and atomic add the real and imaginary
// components individually. The operation as a whole is not atomic, but we can
// safely treat the components independently for the purpose of accumulating.
__device__ inline std::complex<float> CudaAtomicAdd(std::complex<float>* ptr,
                                                    std::complex<float> value) {
  // View the complex as two adjacent floats and accumulate each component
  // with its own scalar atomic add.
  float* component = reinterpret_cast<float*>(ptr);
  const float real_part = CudaAtomicAdd(component, value.real());
  const float imag_part = CudaAtomicAdd(component + 1, value.imag());
  return std::complex<float>(real_part, imag_part);
}

__device__ inline std::complex<double> CudaAtomicAdd(
    std::complex<double>* ptr, std::complex<double> value) {
  // Same component-wise scheme as the float specialization above.
  double* component = reinterpret_cast<double*>(ptr);
  const double real_part = CudaAtomicAdd(component, value.real());
  const double imag_part = CudaAtomicAdd(component + 1, value.imag());
  return std::complex<double>(real_part, imag_part);
}

// CudaAtomicSub
// Generic wrapper: defers to the native atomicSub where available.
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicSub(T* ptr, U value) {
  return atomicSub(ptr, value);
}

// Specializations of subtraction which add the negative value.
// float/double subtraction reuses atomic add with the negated value.
__device__ inline float CudaAtomicSub(float* ptr, float value) {
  return CudaAtomicAdd(ptr, -value);
}

__device__ inline double CudaAtomicSub(double* ptr, double value) {
  return CudaAtomicAdd(ptr, -value);
}

// Unsigned negation relies on two's-complement wraparound: adding -value
// modulo 2^64 is equivalent to subtracting value.
__device__ inline tensorflow::uint64 CudaAtomicSub(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return CudaAtomicAdd(ptr, -value);
}

// No native half atomics; emulate via the CAS helper.
__device__ inline Eigen::half CudaAtomicSub(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return a - value; });
}

// CudaAtomicMax
// Generic wrapper: defers to the native atomicMax where available.
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMax(T* ptr, U value) {
  return atomicMax(ptr, value);
}

// Floating-point max has no native atomic; emulate via CAS.
__device__ inline float CudaAtomicMax(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return max(a, value); });
}

__device__ inline double CudaAtomicMax(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return max(a, value); });
}

__device__ inline Eigen::half CudaAtomicMax(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return max(a, value); });
}

#if __CUDA_ARCH__ < 320
// 64-bit atomicMax is only natively available on sm_32+; emulate below that.
__device__ inline tensorflow::uint64 CudaAtomicMax(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return max(a, value); });
}
#endif

// CudaAtomicMin
// Generic wrapper: defers to the native atomicMin where available.
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMin(T* ptr, U value) {
  return atomicMin(ptr, value);
}

// Floating-point min has no native atomic; emulate via CAS.
__device__ inline float CudaAtomicMin(float* ptr, float value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](float a) { return min(a, value); });
}

__device__ inline double CudaAtomicMin(double* ptr, double value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](double a) { return min(a, value); });
}

__device__ inline Eigen::half CudaAtomicMin(Eigen::half* ptr,
                                            Eigen::half value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](Eigen::half a) { return min(a, value); });
}
#if __CUDA_ARCH__ < 320
// 64-bit atomicMin is only natively available on sm_32+; emulate below that.
__device__ inline tensorflow::uint64 CudaAtomicMin(tensorflow::uint64* ptr,
                                                   tensorflow::uint64 value) {
  return detail::CudaAtomicCasHelper(
      ptr, [value](tensorflow::uint64 a) { return min(a, value); });
}
#endif

// CudaAtomicMul
// CUDA has no native atomic multiply; emulate via the CAS helper for any type
// it supports. Beware of heavy contention: every failed CAS retries.
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicMul(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a * value; });
}

// CudaAtomicDiv
// CUDA has no native atomic divide; emulate via the CAS helper. Note that for
// integer T this performs truncating division on each retry.
template <typename T, typename U>
__device__ detail::ToTypeIfConvertible<U, T> CudaAtomicDiv(T* ptr, U value) {
  return detail::CudaAtomicCasHelper(ptr, [value](T a) { return a / value; });
}

}  // namespace tensorflow
#endif  // GOOGLE_CUDA
#endif  // TENSORFLOW_CORE_UTIL_CUDA_DEVICE_FUNCTIONS_H_