1/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15 16#if GOOGLE_CUDA 17#define EIGEN_USE_GPU 18#include "cuda/include/cuda.h" 19#include "tensorflow/core/kernels/fused_batch_norm_op.h" 20#include "tensorflow/core/util/cuda_kernel_helper.h" 21 22namespace tensorflow { 23namespace functor { 24 25template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, float, float>; 26template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, Eigen::half, float>; 27 28template <class T> 29__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input, 30 double epsilon, T* output) { 31 CUDA_1D_KERNEL_LOOP(index, nthreads) { 32 output[index] = rsqrt(input[index] + T(epsilon)); 33 } 34} 35 36template <class T> 37void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d, 38 const T* variance, double epsilon, 39 int channels, T* inv_variance) { 40 CudaLaunchConfig config = GetCudaLaunchConfig(channels, d); 41 VarianceToInvVarianceKernel<<<config.block_count, config.thread_per_block, 0, 42 d.stream()>>>(config.virtual_thread_count, 43 variance, epsilon, inv_variance); 44} 45 46template <class T> 47__global__ void InvVarianceToVarianceKernel(int nthreads, double epsilon, 48 int sample_size, T* variance) { 49 CUDA_1D_KERNEL_LOOP(index, nthreads) { 50 T inv_var = variance[index]; 51 T var = __fdividef(1, inv_var * inv_var) - T(epsilon); 52 // This is for Bessel's correction 53 var *= T(sample_size) / T((sample_size > 1) ? sample_size - 1 : 1); 54 variance[index] = (var > 0) ? var : 0; 55 } 56} 57 58template <class T> 59void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d, 60 double epsilon, int sample_size, 61 int channels, T* variance) { 62 CudaLaunchConfig config = GetCudaLaunchConfig(channels, d); 63 InvVarianceToVarianceKernel<<<config.block_count, config.thread_per_block, 0, 64 d.stream()>>>(config.virtual_thread_count, 65 epsilon, sample_size, variance); 66} 67 68template <class T> 69void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d, 70 typename TTypes<T>::Flat out) { 71 To32Bit(out).device(d) = 72 To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN()); 73} 74 75template class VarianceToInvVariance<float>; 76template class InvVarianceToVariance<float>; 77template class SetNanFunctor<float>; 78} // namespace functor 79} // namespace tensorflow 80 81#else 82 83#include "tensorflow/core/kernels/fused_batch_norm_op.h" 84 85#endif // GOOGLE_CUDA 86