/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/eye_functor.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace functor {

typedef Eigen::GpuDevice GPUDevice;

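// Sets a flattened batch of m x n matrices to the identity: each output
// element is written as one on the diagonal (row == col) and zero elsewhere.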
template <typename Scalar>
__global__ void EyeKernel(int num_threads, int batch_size, int m, int n,
                          Scalar* output_ptr) {
  const Scalar one = Scalar(1);
  const Scalar zero = Scalar(0);
  CUDA_1D_KERNEL_LOOP(index, num_threads) {
    // TODO(rmlarsen): Benchmark to see if it's just as fast to use mod (%),
    // since it's easier to read.
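    // Decompose the flat index into (batch, row, col) coordinates of a
    // batch of m x n matrices.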
    const int global_row = index / n;
    const int col = index - global_row * n;
    const int batch = global_row / m;
    const int row = global_row - batch * m;
    output_ptr[index] = col == row ? one : zero;
  }
}

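// Partial specialization of EyeFunctor for GPUDevice: launches EyeKernel on
// the device's stream to fill every matrix in the batch with the identity.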
template <typename Scalar>
struct EyeFunctor<GPUDevice, Scalar> {
  void operator()(const GPUDevice& device,
                  typename TTypes<Scalar, 3>::Tensor matrix_batch) {
    const int batch_size = matrix_batch.dimension(0);
    const int m = matrix_batch.dimension(1);
    const int n = matrix_batch.dimension(2);
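    // Launch enough threads to cover all batch_size * m * n output elements;
    // the grid-stride loop in EyeKernel handles any remainder.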
    CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * m * n, device);
    EyeKernel<<<config.block_count, config.thread_per_block, 0,
                device.stream()>>>(config.virtual_thread_count, batch_size, m,
                                   n, matrix_batch.data());
  }
};

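// Explicitly instantiate the GPU functor for float, double, complex64, and
// complex128.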
template struct EyeFunctor<GPUDevice, float>;
template struct EyeFunctor<GPUDevice, double>;
template struct EyeFunctor<GPUDevice, complex64>;
template struct EyeFunctor<GPUDevice, complex128>;

}  // namespace functor
}  // namespace tensorflow

#endif  // GOOGLE_CUDA