/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "cuda/include/cuda.h"
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace functor {

template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, float, float>;
template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, Eigen::half, float>;

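// Element-wise conversion of a variance estimate into an inverse standard
// deviation: output[i] = 1 / sqrt(input[i] + epsilon). Each element is
// handled by the grid-stride CUDA_1D_KERNEL_LOOP.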
template <class T>
__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input,
                                            double epsilon, T* output) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    output[index] = rsqrt(input[index] + T(epsilon));
  }
}

template <class T>
void VarianceToInvVariance<T>::operator()(const Eigen::GpuDevice& d,
                                          const T* variance, double epsilon,
                                          int channels, T* inv_variance) {
  CudaLaunchConfig config = GetCudaLaunchConfig(channels, d);
  VarianceToInvVarianceKernel<<<config.block_count, config.thread_per_block, 0,
                                d.stream()>>>(config.virtual_thread_count,
                                              variance, epsilon, inv_variance);
}

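// In-place inverse of the conversion above: recovers the variance from a
// saved inverse variance as 1 / inv_var^2 - epsilon, applies Bessel's
// correction (scaling by sample_size / (sample_size - 1)), and clamps
// negative results (which can arise from round-off) to zero.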
template <class T>
__global__ void InvVarianceToVarianceKernel(int nthreads, double epsilon,
                                            int sample_size, T* variance) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    T inv_var = variance[index];
    T var = __fdividef(1, inv_var * inv_var) - T(epsilon);
    // This is for Bessel's correction
    var *= T(sample_size) / T((sample_size > 1) ? sample_size - 1 : 1);
    variance[index] = (var > 0) ? var : 0;
  }
}

template <class T>
void InvVarianceToVariance<T>::operator()(const Eigen::GpuDevice& d,
                                          double epsilon, int sample_size,
                                          int channels, T* variance) {
  CudaLaunchConfig config = GetCudaLaunchConfig(channels, d);
  InvVarianceToVarianceKernel<<<config.block_count, config.thread_per_block, 0,
                                d.stream()>>>(config.virtual_thread_count,
                                              epsilon, sample_size, variance);
}

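// Fills every element of `out` with a quiet NaN (e.g. for outputs that are
// not computed in a given mode).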
template <class T>
void SetNanFunctor<T>::operator()(const Eigen::GpuDevice& d,
                                  typename TTypes<T>::Flat out) {
  To32Bit(out).device(d) =
      To32Bit(out).constant(Eigen::NumTraits<T>::quiet_NaN());
}

template class VarianceToInvVariance<float>;
template class InvVarianceToVariance<float>;
template class SetNanFunctor<float>;
}  // namespace functor
}  // namespace tensorflow

#else

#include "tensorflow/core/kernels/fused_batch_norm_op.h"

#endif  // GOOGLE_CUDA