/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/eye_functor.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace functor {

typedef Eigen::GpuDevice GPUDevice;

// Fills a batch of m x n matrices with ones on the main diagonal and zeros
// everywhere else.
//
// Each flat `index` in [0, num_threads) is decomposed as
//   index = (batch * m + row) * n + col
// i.e. `col` is the fastest-varying coordinate, and output_ptr[index] is set
// to one when col == row and to zero otherwise.
//
// Args:
//   num_threads: number of elements to write; the host wrapper below passes
//     the launch config's virtual_thread_count (requested as
//     batch_size * m * n).
//   batch_size: number of matrices in the batch (not read by the kernel; the
//     batch coordinate is recovered from the flat index).
//   m, n: rows and columns of each matrix.
//   output_ptr: destination buffer; must hold at least num_threads elements.
//
// The iteration over `index` is provided by CUDA_1D_KERNEL_LOOP from
// cuda_kernel_helper.h (presumably a grid-stride loop -- see that header).
template <typename Scalar>
__global__ void EyeKernel(int num_threads, int batch_size, int m, int n,
                          Scalar* output_ptr) {
  const Scalar one = Scalar(1);
  const Scalar zero = Scalar(0);
  CUDA_1D_KERNEL_LOOP(index, num_threads) {
    // TODO(rmlarsen): Benchmark to see if it's just as fast to use mod (%),
    // since it's easier to read.
    // Row index across the whole batch, then the column within that row.
    const int global_row = index / n;
    const int col = index - global_row * n;
    // Split the batch-wide row index into (batch, row-within-matrix).
    const int batch = global_row / m;
    const int row = global_row - batch * m;
    output_ptr[index] = col == row ? one : zero;
  }
}

// GPU specialization of EyeFunctor: overwrites every matrix in `matrix_batch`
// (dimensions [batch_size, m, n]) with an identity pattern by launching
// EyeKernel on `device`'s stream. The launch is asynchronous with respect to
// the host; no synchronization is performed here.
template <typename Scalar>
struct EyeFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::Tensor matrix_batch) {
    const int batch_size = matrix_batch.dimension(0);
    const int m = matrix_batch.dimension(1);
    const int n = matrix_batch.dimension(2);
    // Nothing to write for an empty tensor. Skip the launch entirely: a
    // zero-sized work count does not map to a valid launch configuration.
    if (batch_size == 0 || m == 0 || n == 0) {
      return;
    }
    CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device);
    EyeKernel<Scalar><<<config.block_count, config.thread_per_block, 0,
                        device.stream()>>>(config.virtual_thread_count,
                                           batch_size, m, n,
                                           matrix_batch.data());
  }
};

// Explicit instantiations for the scalar types supported on GPU.
template struct EyeFunctor<GPUDevice, float>;
template struct EyeFunctor<GPUDevice, double>;
template struct EyeFunctor<GPUDevice, complex64>;
template struct EyeFunctor<GPUDevice, complex128>;

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA