depthwise_conv_op_gpu.cu.cc revision aad2e3daff8fcd29ed8e5071d4c37a7f94a0421c
1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
3b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenLicensed under the Apache License, Version 2.0 (the "License");
4b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenyou may not use this file except in compliance with the License.
5b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenYou may obtain a copy of the License at
6b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
7b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    http://www.apache.org/licenses/LICENSE-2.0
8b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
9b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenUnless required by applicable law or agreed to in writing, software
10b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chendistributed under the License is distributed on an "AS IS" BASIS,
11b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenSee the License for the specific language governing permissions and
13b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenlimitations under the License.
14b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen==============================================================================*/
15b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
16b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#if GOOGLE_CUDA
17b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define EIGEN_USE_GPU
18b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
19b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
20ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/kernels/depthwise_conv_op.h"
21b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/platform/types.h"
22b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/util/cuda_kernel_helper.h"
23ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/util/tensor_format.h"
24b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
25e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#if !defined(_MSC_VER)
26b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define UNROLL _Pragma("unroll")
277828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL _Pragma("nounroll")
28e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#else
29ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#define UNROLL
307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL
31e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#endif
32b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
33b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chennamespace tensorflow {
34b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowerusing Eigen::GpuDevice;
36b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
37aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Returns whether depthwise convolution forward pass can be performed using the
38aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// faster ('Small') variant of the kernel.
39aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowerEIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
40aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    const DepthwiseArgs args) {
41aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 16 &&
42aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.in_cols <= 16 && args.in_rows == args.out_rows &&
43aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.in_cols == args.out_cols && args.pad_rows >= 0 &&
44aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
45aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.pad_cols < args.filter_cols &&
46aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.filter_rows * args.filter_cols <=
47aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower             (args.in_rows + 1) / 2 * args.in_cols;
48aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower}
49aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
50ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass
51ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NHWC format.
527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
537828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2)
553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                 const T* filter, T* output, int num_outputs) {
57b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_rows = args.in_rows;
58b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_cols = args.in_cols;
59b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_depth = args.in_depth;
607828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
617828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
627828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
637828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
647828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
66b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int stride = args.stride;
67b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int pad_rows = args.pad_rows;
68b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int pad_cols = args.pad_cols;
69b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_rows = args.out_rows;
70b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_cols = args.out_cols;
71b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_depth = args.out_depth;
72b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
73b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
74b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the indexes of this thread in the output.
75b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OD = thread_id % out_depth;
76b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OC = (thread_id / out_depth) % out_cols;
77b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OR = (thread_id / out_depth / out_cols) % out_rows;
78b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OB = thread_id / out_depth / out_cols / out_rows;
79b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the input depth and the index of depth multiplier.
80b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int in_d = OD / depth_multiplier;
81b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int multiplier = OD % depth_multiplier;
82b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
83ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
84ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
85b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_row_start = OR * stride - pad_rows;
86b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_col_start = OC * stride - pad_cols;
87b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_row_end = input_row_start + filter_rows;
88b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_col_end = input_col_start + filter_cols;
89b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    T sum = 0;
915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int input_offset_temp = in_rows * OB;
93b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    if (input_row_start >= 0 && input_col_start >= 0 &&
94b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        input_row_end < in_rows && input_col_end < in_cols) {
95b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
965f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = input_row_start + f_r;
975f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int filter_offset_temp = filter_cols * f_r;
98b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = input_col_start + f_c;
1005f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
1015f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int input_offset =
1025f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
1035f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int filter_offset =
1045f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              multiplier +
1055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
1065f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
107b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
108b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
109b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    } else {
110b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
1115f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = input_row_start + f_r;
1125f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int filter_offset_temp = filter_cols * f_r;
113b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
1145f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = input_col_start + f_c;
115b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
1165f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int in_c = input_col_start + f_c;
1175f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
1185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
1195f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
1205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int filter_offset =
121ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                multiplier + depth_multiplier *
122ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                 (in_d + in_depth * (f_c + filter_offset_temp));
1235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
124b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen          }
125b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
126b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
127b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    }
1285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    output[thread_id] = sum;
129b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
130b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}
131ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
1333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// tailored for small images up to 16x16. Stride and depth multiplier must be 1.
134aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Padding must be 'SAME', which allows to reuse the index computation. Only
135aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
1363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before
1373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration,
1383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// one each in the lower and upper half of a tile.
1393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
1403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          bool kKnownEvenRows>
1413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
1423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
143aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
1443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.x depths.
1453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
1463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
1473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batches = args.batch;
1493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_rows = args.in_rows;
1503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_cols = args.in_cols;
1513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_depth = args.in_depth;
1523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_rows =
1533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
1543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_cols =
1553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
1563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_rows = args.pad_rows;
1573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_cols = args.pad_cols;
1583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
1603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_slices = 8;
1613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_rows = blockDim.z;
1623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // These values are the same for all threads and could
1643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // be precomputed on the CPU.
1655cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int block_size = block_rows * in_cols * block_slices;
1663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_row_size = in_cols * in_depth;
1673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_size = in_rows * in_row_size;
1683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_increment = (in_cols - 1) * block_slices;
1695cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int filter_pixels = filter_rows * filter_cols;
1703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = in_cols + filter_cols - 1;
1713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
1723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = in_rows + filter_rows - even_rows;
1733f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_row_size = tile_cols * block_slices;
1743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_size = tile_rows * tile_row_size;
1753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_offset = block_rows * tile_row_size;
1763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_offset = pad_rows * tile_cols + pad_cols;
1773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batch_blocks = (in_depth + block_slices - 1) / block_slices;
1783f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_blocks = batch_blocks * batches;
1793f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_offset =
1803f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownEvenRows ? in_size / 2 : block_rows * in_row_size;
1813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_depth = threadIdx.x;
1833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_col = threadIdx.y;
1843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_row = threadIdx.z;
1853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in block.
1875cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int thread_pix = thread_row * in_cols + thread_col;
1883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_idx = thread_pix * block_slices + thread_depth;
1893f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Initialize tile, in particular the padding.
1913f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size; i += block_size) {
1923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    shared_data[i] = T(0);
1933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
1943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  __syncthreads();
1953f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in tensors.
1973f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_idx = thread_pix * in_depth + thread_depth;
1983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1993f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in (padded) shared memory.
2003f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_pix = thread_row * tile_cols + thread_col;
2013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_idx = data_pix * block_slices + thread_depth;
2023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in shared memory, offset by pad_rows / pad_cols.
2043f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_pix = data_pix + pad_offset;
2053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_idx = tile_pix * block_slices + thread_depth;
2063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int max_depth = in_depth - thread_depth;
2083f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_write_offset =
2095cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
2103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_read_offset = tile_size + thread_depth;
2113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const bool skip_second =
2123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
2133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
2153f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int batch = b / batch_blocks;
2163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int stack = b - batch * batch_blocks;
2173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int start_depth = stack * block_slices;
2193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int filter_offset = tensor_idx + start_depth;
2203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int inout_offset = batch * in_size + filter_offset;
2213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const bool depth_in_range = start_depth < max_depth;
2223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
2243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
2253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
2263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
2273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
2283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
2293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
2303f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (filter_write_offset != 0) {
2323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_data[filter_write_offset] = ldg(filter_offset + filter);
2333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
2343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
2353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
2373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
2383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
2403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T sum1 = 0;
2413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T sum2 = 0;
2423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      int shared_offset = data_idx;
2433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* filter_ptr = filter_read_offset + shared_data;
2443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_rows; ++r) {
2453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_cols; ++c) {
2463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T filter_value = *filter_ptr;
2473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
2483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum1 += filter_value * tile_ptr[0];
2493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum2 += filter_value * tile_ptr[tile_offset];
2503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          shared_offset += block_slices;
2513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          filter_ptr += block_slices;
2523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
2533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_offset += in_increment;
2543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
2553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const out_ptr = inout_offset + output;
2563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      out_ptr[0] = sum1;
2573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
2583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        out_ptr[tensor_offset] = sum2;
2593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
2603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
2613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
2623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
2633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
2643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
2653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
2663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
267ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass
268ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NCHW format.
2697828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
2707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
2713c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2)
2723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
2733c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                 const T* filter, T* output, int num_outputs) {
274ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
275ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
276ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
2777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
2787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
2797828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
2807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
2817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
2827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
283ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
284ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
285ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
286ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
287ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
288ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
289ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
290ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
291ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the output.
292ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
293ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We want coalesced reads so we make sure that each warp reads
294ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // a contiguous chunk of memory.
295ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
296ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // THIS IS PROBABLY WRONG, we are not doing coalesced reads
297ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // into the input, because of the depth multiplier division...
298ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OC = thread_id % out_cols;
299ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OR = (thread_id / out_cols) % out_rows;
300ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OD = (thread_id / out_cols / out_rows) % out_depth;
301ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OB = thread_id / out_cols / out_rows / out_depth;
302ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
303ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the input depth and the index of depth multiplier
304ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // based off the output depth index that this thread is
305ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // computing n.
306ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = OD / depth_multiplier;
307ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int multiplier = OD % depth_multiplier;
308ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
309ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Data is stored in the following format (let's assume we
310ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // flatten the height and width into one contiguous dimension
311ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // called "P".
312ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
313ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 ....
314ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 ....
315ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
316ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Each row contains in_depth * in_rows * in_cols values
317ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each sample in the batch.
318ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
319ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We can further flatten it into:
320ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
321ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C1P1 B1C1P2 .....
322ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C2P1 B1C2P2 ....
323ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C1P1 B2C1P2 .....
324ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C2P1 B2C2P2 ....
325ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
326ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // where each row is a contiguous array of all of the spatial
327ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // pixels for a given batch and input depth.  The following
328ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // loop unrolls across the filter dimensions for a given thread,
329ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // indexing into the filter value and the corresponding input
330ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // patch.
331ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
332ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We can compute the index into the patch once right here.
333ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_offset_temp = (OB * in_depth + in_d) * (in_rows * in_cols);
334ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
335ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Finally, we can iterate over the spatial dimensions and perform the
336ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // convolution, writing into the output at the end.
337ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
338ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We perform an additional optimization, where we can determine
339ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // whether the patch fits within the image indices statically, and
340ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // avoid boundary checking within the loop.
341ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_row_start = OR * stride - pad_rows;
342ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_col_start = OC * stride - pad_cols;
343ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_row_end = input_row_start + filter_rows;
344ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_col_end = input_col_start + filter_cols;
345ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
346ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    T sum = 0;
347ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    if (input_row_start >= 0 && input_col_start >= 0 &&
348ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        input_row_end < in_rows && input_col_end < in_cols) {
349ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      // Loop that doesn't need to check for boundary conditions.
350ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
351ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = input_row_start + f_r;
352ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_offset_temp = filter_cols * f_r;
353ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
354ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = input_col_start + f_c;
355ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
356ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int input_offset =
357ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (input_offset_temp) + (in_r * in_cols) + in_c;
358ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int filter_offset =
359ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              multiplier +
360ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
361ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
362ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
363ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
364ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
365ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      // Loop that needs to check for boundary conditions.
366ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
367ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = input_row_start + f_r;
368ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_offset_temp = filter_cols * f_r;
369ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
370ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = input_col_start + f_c;
371ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          // TODO(vrv): the in_r check can be done outside of this loop;
372ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          // benchmark both methods to determine the better decision.
373ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
374ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int in_c = input_col_start + f_c;
375ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
376ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // input_offset_temp indexes into the start of memory
377ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // where the spatial data starts.
378ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int input_offset =
379ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                (input_offset_temp) + (in_r * in_cols) + in_c;
380ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
381ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int filter_offset =
382ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                multiplier + depth_multiplier *
383ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                 (in_d + in_depth * (f_c + filter_offset_temp));
384ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
385ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          }
386ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
387ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
388ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
389ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
390ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    output[thread_id] = sum;
391ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
392ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
393ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
394aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution forward pass in NCHW format,
395aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// tailored for small images up to 16x16. Stride and depth multiplier must be 1.
396aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Padding must be 'SAME', which allows to reuse the index computation. Only
397aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
398aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before
399aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration,
400aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// one each in the lower and upper half of a tile.
401aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
402aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          bool kKnownEvenRows>
403aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
404aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
405aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
406aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.z depths.
407aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
408aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
409aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
410aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int batches = args.batch;
411aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_rows = args.in_rows;
412aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_cols = args.in_cols;
413aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_depth = args.in_depth;
414aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_rows =
415aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
416aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_cols =
417aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
418aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int pad_rows = args.pad_rows;
419aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int pad_cols = args.pad_cols;
420aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
421aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
422aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_rows = blockDim.y;
423aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_slices = 8;
424aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
425aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // These values are the same for all threads and could
426aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // be precomputed on the CPU.
427aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_pixels = in_cols * block_rows;
428aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_size = block_pixels * block_slices;
429aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_pixels = in_cols * in_rows;
430aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_increment = in_cols - 1;
431aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_pixels = filter_rows * filter_cols;
432aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_cols = in_cols + filter_cols - 1;
433aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
434aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_rows = in_rows + filter_rows - even_rows;
435aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_pixels = tile_cols * tile_rows;
436aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_size = tile_pixels * block_slices;
437aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_offset = block_rows * tile_cols;
438aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int pad_offset = pad_rows * tile_cols + pad_cols;
439aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_slices = in_depth * batches;
440aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int in_blocks = (in_slices + block_slices - 1) / block_slices;
441aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
442aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int thread_col = threadIdx.x;
443aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int thread_row = threadIdx.y;
444aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int thread_depth = threadIdx.z;
445aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
446aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Position in block.
447aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int thread_pix = thread_row * in_cols + thread_col;
448aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int thread_idx = thread_depth * block_pixels + thread_pix;
449aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
450aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Initialize tile, in particular the padding.
451aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size; i += block_size) {
452aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    shared_data[i] = T(0);
4533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
454aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  __syncthreads();
4553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
456aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Position in tensors.
457aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tensor_idx = thread_depth * in_pixels + thread_pix;
458aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
459aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Position in (padded) shared memory.
460aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int data_pix = thread_row * tile_cols + thread_col;
461aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int data_idx = thread_depth * tile_pixels + data_pix;
462aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
463aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Position in shared memory, offset by pad_rows / pad_cols.
464aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int tile_idx = data_idx + pad_offset;
465aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
466aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  // Filter is always in HWCK format, irrespective of the input/output format.
467aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_pix = thread_idx / block_slices;
468aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_depth = thread_idx % block_slices;
469aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_idx = filter_pix * in_depth;
470aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
471aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int max_slice = in_slices - thread_depth;
472aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_write_offset =
473aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      filter_pix < filter_pixels ? tile_size + thread_idx : 0;
474aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int filter_read_offset = tile_size + thread_depth;
475aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const bool skip_second =
476aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
477aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
478aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
479aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    const int slice = b * block_slices;
480aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
481aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    const int inout_offset = slice * in_pixels + tensor_idx;
482aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    const bool slice_in_range = slice < max_slice;
483aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
484aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    if (slice_in_range) {
485aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
486aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
487aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
488aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      if (!skip_second) {
489aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
490aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      }
491aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    }
492aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
493aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    if (filter_write_offset != 0) {
494aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      const int filter_offset = filter_idx + (slice + filter_depth) % in_depth;
495aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      shared_data[filter_write_offset] = ldg(filter_offset + filter);
496aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    }
497aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
498aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
499aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    __syncthreads();
500aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
501aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    if (slice_in_range) {
502aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      T sum1 = 0;
503aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      T sum2 = 0;
504aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      int shared_offset = data_idx;
505aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      const T* filter_ptr = filter_read_offset + shared_data;
506aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_rows; ++r) {
507aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_cols; ++c) {
508aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          const T filter_value = *filter_ptr;
509aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
510aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          sum1 += filter_value * tile_ptr[0];
511aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          sum2 += filter_value * tile_ptr[tile_offset];
512aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          ++shared_offset;
513aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          filter_ptr += block_slices;
514aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        }
515aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        shared_offset += in_increment;
516aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      }
517aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      T* const out_ptr = inout_offset + output;
518aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      out_ptr[0] = sum1;
519aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      if (!skip_second) {
520aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        out_ptr[block_pixels] = sum2;
521aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      }
522aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    }
523aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
524aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
525aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    __syncthreads();
5263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
527aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower}
5283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
529aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
530aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower          bool kKnownEvenRows>
531aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
532aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                   const T* input, const T* filter, T* output,
533aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                   TensorFormat data_format) {
534aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_rows = (args.in_rows + 1) / 2;
535aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int block_slices = 8;
5363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = args.in_cols + args.filter_cols - 1;
5373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
5385cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int tile_pixels = tile_rows * tile_cols;
5395cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int filter_pixels = args.filter_rows * args.filter_cols;
5403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
541aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  const int shared_memory_size =
542aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower      block_slices * (tile_pixels + filter_pixels) * sizeof(T);
5433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int num_outputs =
5443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
545aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
546aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
547aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    dim3 block_dim = dim3(block_slices, args.in_cols, block_rows);
5483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
5493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        num_outputs, d,
5503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth,
551aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                          kKnownFilterHeight, kKnownEvenRows>,
5523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
5533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWCSmall<T, kKnownFilterWidth, kKnownFilterHeight,
554aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                      kKnownEvenRows>
5553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
5563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            args, input, filter, output);
557aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
558aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    dim3 block_dim = dim3(args.in_cols, block_rows, block_slices);
5593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
5603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        num_outputs, d,
561aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth,
562aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                          kKnownFilterHeight, kKnownEvenRows>,
5633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
564aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    DepthwiseConv2dGPUKernelNCHWSmall<T, kKnownFilterWidth, kKnownFilterHeight,
565aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                      kKnownEvenRows>
5663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
5673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            args, input, filter, output);
568aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  } else {
569aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    assert(false);
570aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  }
571aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower}
572aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
573aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
574aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& d, const DepthwiseArgs args,
575aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                   const T* input, const T* filter, T* output,
576aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                   TensorFormat data_format) {
577aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  if (args.in_rows & 1) {
578aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
579aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                  /*kKnownEvenRows=*/false>(
580aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        d, args, input, filter, output, data_format);
581aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  } else {
582aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight,
583aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                  /*kKnownEvenRows=*/true>(
584aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        d, args, input, filter, output, data_format);
5853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
5863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
5873f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
5887828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
5897828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
5907828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
5917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                              const T* input, const T* filter, T* output,
5927828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                              TensorFormat data_format) {
593aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
594aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kKnownFilterWidth, kKnownFilterHeight>(
595aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower        d, args, input, filter, output, data_format);
5963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return;
5973f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
5987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_outputs =
5997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
6003c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  // The compile-time constant version runs faster with a single block.
6013c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
6023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                      kKnownDepthMultiplier < 0
6033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                  ? std::numeric_limits<int>::max()
6043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                  : d.getNumCudaMultiProcessors();
6057828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
6063c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
6073c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_outputs, d,
6083c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
6093c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                     kKnownDepthMultiplier>,
6103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
6117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
6127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                 kKnownDepthMultiplier>
6133c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        <<<std::min(max_block_count, config.block_count),
6143c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
6153c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                                     output, num_outputs);
6167828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
6173c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
6183c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_outputs, d,
6193c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
6203c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                     kKnownDepthMultiplier>,
6213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
6227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
6237828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                 kKnownDepthMultiplier>
6243c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        <<<std::min(max_block_count, config.block_count),
6253c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
6263c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                                     output, num_outputs);
6277828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
6287828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
6297828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
6307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
631b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
632b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
633b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate <typename T>
634b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenstruct DepthwiseConv2dGPULaunch {
6357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input,
636ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* filter, T* output, TensorFormat data_format) {
6377828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.filter_rows == 3 && args.filter_cols == 3 &&
6387828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        args.depth_multiplier == 1) {
6397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dGPU<T, 3, 3, 1>(d, args, input, filter, output,
6407828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           data_format);
641ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
6427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dGPU<T, -1, -1, -1>(d, args, input, filter, output,
6437828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                              data_format);
644ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
645b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
646b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen};
647b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
648b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<float>;
649b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<double>;
650b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
6515f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
6527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
6537828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
6543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
6553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
6563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* out_backprop,
6573c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* filter, T* in_backprop,
6583c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              int num_in_backprop) {
6595f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_rows = args.in_rows;
6605f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_cols = args.in_cols;
6615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
6627828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
6637828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
6647828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
6657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
6667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
6677828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
6685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
6695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_rows = args.pad_rows;
6705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_cols = args.pad_cols;
6715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_rows = args.out_rows;
6725f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_cols = args.out_cols;
6735f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
6745f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
6755f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
6765f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
6775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_d = thread_id % in_depth;
6785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c = (thread_id / in_depth) % in_cols;
6795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r = (thread_id / in_depth / in_cols) % in_rows;
6805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int b = thread_id / in_depth / in_cols / in_rows;
6815f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
6825f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    T sum = 0;
6835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
6845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r_start =
6855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
6865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
6875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c_start =
6885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
6895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
6905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
6917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
6922f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int f_r = in_r + pad_rows - out_r * stride;
6932f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int temp_out_backprop_offset =
6942f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower          out_depth * out_cols * (out_r + out_rows * b);
6952f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int temp_filter_offset = filter_cols * f_r;
6967828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
6972f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        const int f_c = in_c + pad_cols - out_c * stride;
6982f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        int filter_offset =
6992f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower            depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset));
7002f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        const int out_backprop_offset =
7012f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower            out_depth * out_c + temp_out_backprop_offset;
702b286574da19e18371e759fe6b676bb07728ef9acA. Unique TensorFlower#pragma unroll 6
7032f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        for (int i = 0; i < depth_multiplier; ++i) {
7042f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower          sum += ldg(out_backprop + out_backprop_offset +
7052f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower                     in_d * depth_multiplier + i) *
7062f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower                 ldg(filter + filter_offset + i);
7075f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
7085f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
7095f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    }
7105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_backprop_offset =
7115f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
7125f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    in_backprop[in_backprop_offset] = sum;
7135f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
7145f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}
7155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
7163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution backward w.r.t. input in
7173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// NCHW format, tailored for small images up to 16x16. Stride and depth
7183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// multiplier must be 1. Padding must be 'SAME', which allows to reuse the index
7193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// computation.
7203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Implementation is the same as the forward pass, except that the filter is
7213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// rotate by 180°, see filter_read_offset and filter_ptr.
7223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before
7233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration,
7243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// one each in the lower and upper half of a tile.
7253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          bool kKnownEvenRows>
7273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__
7283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropInputGPUKernelNHWCSmall(
7293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
7303f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.x depths.
7313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
7323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
7333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batches = args.batch;
7353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_rows = args.in_rows;
7363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_cols = args.in_cols;
7373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_depth = args.in_depth;
7383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_rows =
7393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
7403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_cols =
7413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
7423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_rows = args.pad_rows;
7433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_cols = args.pad_cols;
7443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
7463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_slices = 8;
7473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_rows = blockDim.z;
7483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // These values are the same for all threads and could
7503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // be precomputed on the CPU.
7515cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int block_size = block_rows * in_cols * block_slices;
7523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_row_size = in_cols * in_depth;
7533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_size = in_rows * in_row_size;
7543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_increment = (in_cols - 1) * block_slices;
7555cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int filter_pixels = filter_rows * filter_cols;
7563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = in_cols + filter_cols - 1;
7573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int even_rows = kKnownEvenRows || (1 & ~in_rows);
7583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = in_rows + filter_rows - even_rows;
7593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_row_size = tile_cols * block_slices;
7603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_size = tile_rows * tile_row_size;
7613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_offset = block_rows * tile_row_size;
7623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_offset = pad_rows * tile_cols + pad_cols;
7633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batch_blocks = (in_depth + block_slices - 1) / block_slices;
7643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_blocks = batch_blocks * batches;
7653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_offset =
7663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownEvenRows ? in_size / 2 : block_rows * in_row_size;
7673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_depth = threadIdx.x;
7693f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_col = threadIdx.y;
7703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_row = threadIdx.z;
7713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in block.
7735cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int thread_pix = thread_row * in_cols + thread_col;
7743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_idx = thread_pix * block_slices + thread_depth;
7753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Initialize tile, in particular the padding.
7773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size; i += block_size) {
7783f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    shared_data[i] = T(0);
7793f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
7803f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  __syncthreads();
7813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in tensors.
7833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_idx = thread_pix * in_depth + thread_depth;
7843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in (padded) shared memory.
7863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_pix = thread_row * tile_cols + thread_col;
7873f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_idx = data_pix * block_slices + thread_depth;
7883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7893f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in shared memory, offset by pad_rows / pad_cols.
7903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_pix = data_pix + pad_offset;
7913f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_idx = tile_pix * block_slices + thread_depth;
7923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int max_depth = in_depth - thread_depth;
7943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_write_offset =
7955cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
7963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_read_offset =
7975cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      tile_size + filter_pixels * block_slices + thread_depth;
7983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const bool skip_second =
7993f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      !kKnownEvenRows && thread_row + (in_rows & 1) == block_rows;
8003f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
8023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int batch = b / batch_blocks;
8033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int stack = b - batch * batch_blocks;
8043f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int start_depth = stack * block_slices;
8063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int filter_offset = tensor_idx + start_depth;
8073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int inout_offset = batch * in_size + filter_offset;
8083f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const bool depth_in_range = start_depth < max_depth;
8093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
8113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
8123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
8133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
8143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
8153f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
8163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
8173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (filter_write_offset != 0) {
8193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_data[filter_write_offset] = ldg(filter_offset + filter);
8203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
8213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
8223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
8243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
8253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
8273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T sum1 = 0;
8283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T sum2 = 0;
8293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      int shared_offset = data_idx;
8303f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* filter_ptr = filter_read_offset + shared_data;
8313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_rows; ++r) {
8323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_cols; ++c) {
8333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          filter_ptr -= block_slices;
8343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T filter_value = *filter_ptr;
8353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
8363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum1 += filter_value * tile_ptr[0];
8373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum2 += filter_value * tile_ptr[tile_offset];
8383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          shared_offset += block_slices;
8393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
8403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_offset += in_increment;
8413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
8423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const out_ptr = inout_offset + output;
8433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      out_ptr[0] = sum1;
8443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
8453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        out_ptr[tensor_offset] = sum2;
8463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
8473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
8483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
8503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
8513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
8523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
8533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
8547828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
8557828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
8563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
857ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
858ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* out_backprop,
859ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* filter, T* in_backprop,
860ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              int num_in_backprop) {
861ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
862ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
863ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
8647828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
8657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
8667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
8677828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
8687828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
8697828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
870ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
871ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
872ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
873ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
874ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
875ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
876ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
877ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // TODO(vrv): Consider assigning threads to output and using
878ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // atomics for accumulation, similar to the filter case.
879ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
880ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the input.
881ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c = thread_id % in_cols;
882ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r = (thread_id / in_cols) % in_rows;
883ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = (thread_id / in_cols / in_rows) % in_depth;
884ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int b = thread_id / in_depth / in_cols / in_rows;
885ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
886ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    T sum = 0;
887ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d_start = in_d * depth_multiplier;
888ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d_end = out_d_start + depth_multiplier;
889ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
890ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r_start =
891ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
892ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
893ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c_start =
894ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
895ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
896ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
897ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    UNROLL for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
898ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
899ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int f_r = in_r + pad_rows - out_r * stride;
900ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_dm = out_d - out_d_start;
901ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
902ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int temp_filter_offset = filter_cols * f_r;
903ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
904ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int f_c = in_c + pad_cols - out_c * stride;
905ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int filter_offset =
906ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              filter_dm + args.depth_multiplier *
907ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + temp_filter_offset));
908ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
909ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int out_backprop_offset =
910ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (b * out_depth * out_rows * out_cols) +
911ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (out_d * out_rows * out_cols) + (out_r * out_cols) + (out_c);
912ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
913ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          sum += ldg(out_backprop + out_backprop_offset) *
914ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                 ldg(filter + filter_offset);
915ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
916ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
917ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
918ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_backprop_offset = (b * in_rows * in_cols * in_depth) +
919ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                   (in_d * in_rows * in_cols) +
920ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                   (in_r * in_cols) + (in_c);
921ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    in_backprop[in_backprop_offset] = sum;
922ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
923ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
924ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
9253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
9263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropInputGPUSmall(
9273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
9283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const T* filter, T* in_backprop, TensorFormat data_format) {
9293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (data_format != FORMAT_NHWC || args.depth_multiplier != 1 ||
9303f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.stride != 1 || args.in_rows > 16 || args.in_cols > 16 ||
9313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.in_rows != args.out_rows || args.in_cols != args.out_cols ||
9323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.pad_rows < 0 || args.pad_rows >= args.filter_rows ||
9333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.pad_cols < 0 || args.pad_cols >= args.filter_cols) {
9343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return false;
9353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
9363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
9373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_rows = (args.in_rows + 1) / 2;
9383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (args.filter_rows * args.filter_cols > args.in_cols * block_rows) {
9393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return false;
9403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
9413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
9423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = args.in_cols + args.filter_cols - 1;
9433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
9445cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int tile_pixels = tile_rows * tile_cols;
9455cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int filter_pixels = args.filter_rows * args.filter_cols;
9463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  dim3 block_dim = dim3(8, args.in_cols, block_rows);
9473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int shared_memory_size =
9485cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      block_dim.x * (tile_pixels + filter_pixels) * sizeof(T);
9493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
9503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int num_in_backprop =
9513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.batch * args.in_rows * args.in_cols * args.in_depth;
9523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (args.in_rows & 1) {
9533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
9543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        num_in_backprop, d,
9553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<
9563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, false>,
9573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
9583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<T, kKnownFilterWidth,
9593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                   kKnownFilterHeight, false>
9603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
9613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            args, out_backprop, filter, in_backprop);
9623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  } else {
9633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
9643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        num_in_backprop, d,
9653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<
9663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, true>,
9673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
9683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWCSmall<T, kKnownFilterWidth,
9693f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                   kKnownFilterHeight, true>
9703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
9713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            args, out_backprop, filter, in_backprop);
9723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
9733f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
9743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  return true;
9753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
9763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
9777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
9787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
9797828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
9807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const DepthwiseArgs args,
9817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* out_backprop,
9827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* filter, T* in_backprop,
9837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           TensorFormat data_format) {
9843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (TryLaunchDepthwiseConv2dBackpropInputGPUSmall<T, kKnownFilterWidth,
9853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                    kKnownFilterHeight>(
9863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          d, args, out_backprop, filter, in_backprop, data_format)) {
9873f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return;
9883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
9897828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_in_backprop =
9907828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.in_rows * args.in_cols * args.in_depth;
9917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
9923c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
9933c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_in_backprop, d,
9943c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNHWC<
9953c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
9963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
9977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWC<
9987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
9997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
10007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, filter, in_backprop, num_in_backprop);
10017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
10023c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
10033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_in_backprop, d,
10043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNCHW<
10053c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
10063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
10077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNCHW<
10087828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
10097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
10107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, filter, in_backprop, num_in_backprop);
10117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
10127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
10137828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
10147828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
10157828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
10165f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
10175f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
10185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropInputGPULaunch {
10197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args,
1020ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* out_backprop, const T* filter, T* in_backprop,
1021ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  TensorFormat data_format) {
10227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.depth_multiplier == 1) {
10237828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      if (args.filter_rows == 3 && args.filter_cols == 3) {
10247828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3, 1>(
10257828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            d, args, out_backprop, filter, in_backprop, data_format);
1026ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      } else {
10277828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, 1>(
10287828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            d, args, out_backprop, filter, in_backprop, data_format);
1029ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
10302f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower    } else {
10317828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, -1>(
10327828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, filter, in_backprop, data_format);
10332f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower    }
10345f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
10355f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen};
10365f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
10375f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<float>;
10385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<double>;
10395f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
10405f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
10417828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
10427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
10433c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
10443c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
10453c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* out_backprop,
10463c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* input,
10473c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               T* filter_backprop,
10483c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               int num_out_backprop) {
10495f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_rows = args.in_rows;
10505f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_cols = args.in_cols;
10515f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
10527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
10537828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
10547828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
10557828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
10567828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
10577828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
10585f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
10595f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_rows = args.pad_rows;
10605f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_cols = args.pad_cols;
10615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_rows = args.out_rows;
10625f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_cols = args.out_cols;
10635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
10645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
10655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
10665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
10675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_d = thread_id % out_depth;
10685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c = (thread_id / out_depth) % out_cols;
10695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r = (thread_id / out_depth / out_cols) % out_rows;
10705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int b = thread_id / out_depth / out_cols / out_rows;
10715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the input depth and the index of depth multiplier.
10725f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_d = out_d / depth_multiplier;
10735f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int dm = out_d % depth_multiplier;
10745f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
1075ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
1076ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
10775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r_start = out_r * stride - pad_rows;
10785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c_start = out_c * stride - pad_cols;
10795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r_end = in_r_start + filter_rows;
10805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c_end = in_c_start + filter_cols;
10815f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
10825f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_backprop_offset =
10835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
10845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const T out_bp = ldg(out_backprop + out_backprop_offset);
10855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
10865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        in_c_end < in_cols) {
10875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
10885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = in_r_start + f_r;
10895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
10905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int input_offset_temp = in_cols * (in_r + in_rows * b);
10915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
10925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = in_c_start + f_c;
10935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
10945f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int input_offset = in_d + in_depth * (in_c + input_offset_temp);
10955f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          T partial_sum = ldg(input + input_offset) * out_bp;
10965f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          T* addr = filter_backprop +
1097ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                    (dm + depth_multiplier *
1098ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + filter_cols * f_r)));
10995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          CudaAtomicAdd(addr, partial_sum);
11005f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
11015f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
11025f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    } else {
11035f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
11045f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = in_r_start + f_r;
11055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
11065f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int input_offset_temp = in_cols * (in_r + in_rows * b);
11075f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
11085f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = in_c_start + f_c;
11095f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int addr_temp = filter_cols * f_r;
11105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
11115f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
11125f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
11135f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                in_d + in_depth * (in_c + input_offset_temp);
11145f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T partial_sum = ldg(input + input_offset) * out_bp;
11155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T* addr =
11165f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                filter_backprop +
11175f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
11185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // Potentially many threads can add to the same address so we have
11195f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // to use atomic add here.
11205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // TODO(jmchen): If atomic add turns out to be slow, we can:
11215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // 1. allocate multiple buffers for the gradients (one for each
1122ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // example in a batch, for example). This can reduce the
1123ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // contention on the destination; 2. Have each thread compute one
1124ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // gradient for an element in the filters. This should work well
1125ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // when the input depth is big and filter size is not too small.
1126ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            CudaAtomicAdd(addr, partial_sum);
1127ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          }
1128ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
1129ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
1130ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
1131ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
1132ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
1133ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
11343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
11353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// NCHW format, tailored for small images up to 16x16. Stride and depth
11363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// multiplier must be 1. Padding must be 'SAME'.
11373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input tensor are loaded into shared memory before performing the
11383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// convolution. Per iteration and filter element, each thread first performs
11393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// a partial convolution for two elements, one each in the lower and upper half
11403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// of a tile. The intermediate result of 4 consecutive columns are then
11413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// accumulated and written to shared memory. Finally, the values in shared
11423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// memory are warp-accumulated (in chunks of 32 elements) and summed up in
11433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// global memory using atomics.
11443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
11453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__
11463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
11473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const DepthwiseArgs args, const T* output, const T* input, T* filter) {
11483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.x depths.
11493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
11503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
11513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batches = args.batch;
11533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_rows = args.in_rows;
11543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_cols = args.in_cols;
11553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_depth = args.in_depth;
11563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_rows =
11573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
11583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_cols =
11593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
11603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_rows = args.pad_rows;
11613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_cols = args.pad_cols;
11623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Fixed blockDim.x, corresponding to Pascal's global load granularity of 32B.
11643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_slices = 8;
11653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_rows = blockDim.z;
11663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // These values are the same for all threads and could
11683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // be precomputed on the CPU.
11695cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int block_size = block_rows * in_cols * block_slices;
11703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_row_size = in_cols * in_depth;
11713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_size = in_rows * in_row_size;
11723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_increment = (in_cols - 1) * block_slices;
11735cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int filter_pixels = filter_rows * filter_cols;
11743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = in_cols + filter_cols - 1;
11753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = 2 * block_rows + filter_rows - 1;
11763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_row_size = tile_cols * block_slices;
11773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_size = tile_rows * tile_row_size;
11783f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_offset = block_rows * tile_row_size;
11793f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int pad_offset = pad_rows * tile_cols + pad_cols;
11803f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int batch_blocks = (in_depth + block_slices - 1) / block_slices;
11813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_blocks = batch_blocks * batches;
11823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_offset = block_rows * in_row_size;
11833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_pixels = 32;
11843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_increment = accum_pixels * block_slices;
11855cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int accum_size = filter_pixels * accum_increment;
11863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11873f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_depth = threadIdx.x;
11883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_col = threadIdx.y;
11893f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_row = threadIdx.z;
11903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11913f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in block.
11925cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int thread_pix = thread_row * in_cols + thread_col;
11933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_idx = thread_pix * block_slices + thread_depth;
11943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
11953f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Initialize tile, in particular the padding and accumulator.
11963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size + accum_size; i += block_size) {
11973f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    shared_data[i] = T(0);
11983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
11993f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  __syncthreads();
12003f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in tensors.
12023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_idx = thread_pix * in_depth + thread_depth;
12033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12043f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in (padded) shared memory.
12053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_pix = thread_row * tile_cols + thread_col;
12063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int data_idx = data_pix * block_slices + thread_depth;
12073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12083f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in shared memory, offset by pad_rows / pad_cols.
12093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_pix = data_pix + pad_offset;
12103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_idx = tile_pix * block_slices + thread_depth;
12113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in accumulator (1 per 4 threads, depth major).
12133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_pix = thread_pix / 4;
12143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_idx = thread_depth * accum_pixels + accum_pix;
12153f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int max_depth = in_depth - thread_depth;
12173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_offset = tile_size + accum_idx;
12183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const bool skip_second = block_rows + thread_row >= in_rows;
12193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
12213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int batch = b / batch_blocks;
12223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int stack = b - batch * batch_blocks;
12233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int start_depth = stack * block_slices;
12253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int filter_offset = tensor_idx + start_depth;
12263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int inout_offset = batch * in_size + filter_offset;
12273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const bool depth_in_range = start_depth < max_depth;
12283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
12303f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
12313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
12323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
12333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
12343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
12353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
12363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
12373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
12393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
12403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    if (depth_in_range) {
12423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const out_ptr = inout_offset + output;
12433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T out1 = ldg(out_ptr);
12443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T out2 = skip_second ? T(0) : ldg(tensor_offset + out_ptr);
12453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      int shared_offset = data_idx;
12463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* accum_ptr = accum_offset + shared_data;
12473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_rows; ++r) {
12483f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_cols; ++c) {
12493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
12503f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
12513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          val += CudaShuffleDown(val, 16);
12523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          val += CudaShuffleDown(val, 8);
12533f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          if (!(thread_idx & 24) /* i.e. 'lane_idx < 8' */) {
12543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            *accum_ptr = val;
12553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          }
12563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          shared_offset += block_slices;
12573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          accum_ptr += accum_increment;
12583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
12593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_offset += in_increment;
12603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
12613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
12623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
12643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
12653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
12663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const T* const accum_data = tile_size + shared_data;
12673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    for (int i = thread_idx; i < accum_size; i += block_size) {
12683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const int filter_idx = i / 32;
12693f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const int filter_pix = filter_idx / block_slices;
12703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const int filter_depth = filter_idx % block_slices + start_depth;
12713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const int filter_offset = filter_pix * in_depth + filter_depth;
12723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (filter_depth < in_depth) {
12733f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        T val = accum_data[i];
12743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        val += CudaShuffleDown(val, 16);
12753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        val += CudaShuffleDown(val, 8);
12763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        val += CudaShuffleDown(val, 4);
12773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        val += CudaShuffleDown(val, 2);
12783f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        val += CudaShuffleDown(val, 1);
12793f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        if (!(thread_idx & 31) /* i.e. 'lane_idx == 0' */) {
12803f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          CudaAtomicAdd(filter_offset + filter, val);
12813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
12823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
12833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
12843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
12853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
12863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
1287ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
12887828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
12897828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
12903c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
12913c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
12923c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* out_backprop,
12933c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* input,
12943c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               T* filter_backprop,
12953c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               int num_out_backprop) {
1296ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
1297ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
1298ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
12997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
13007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
13017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
13027828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
13037828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
13047828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
1305ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
1306ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
1307ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
1308ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
1309ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
1310ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
1311ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1312ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
1313ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the output.
1314ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c = thread_id % out_cols;
1315ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r = (thread_id / out_cols) % out_rows;
1316ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d = (thread_id / out_cols / out_rows) % out_depth;
1317ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1318ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int b = thread_id / out_depth / out_cols / out_rows;
1319ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the input depth and the index of depth multiplier.
1320ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = out_d / depth_multiplier;
1321ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int dm = out_d % depth_multiplier;
1322ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1323ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
1324ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
1325ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r_start = out_r * stride - pad_rows;
1326ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c_start = out_c * stride - pad_cols;
1327ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r_end = in_r_start + filter_rows;
1328ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c_end = in_c_start + filter_cols;
1329ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1330ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_backprop_offset = (b * out_depth * out_rows * out_cols) +
1331ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                    (out_d * out_rows * out_cols) +
1332ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                    (out_r * out_cols) + (out_c);
1333ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1334ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const T out_bp = ldg(out_backprop + out_backprop_offset);
1335ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
1336ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        in_c_end < in_cols) {
1337ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
1338ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = in_r_start + f_r;
1339ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        // Avoid repeated computation.
1340ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
1341ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_d * in_rows * in_cols) +
1342ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_r * in_cols);
1343ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1344ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
1345ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = in_c_start + f_c;
1346ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int input_offset = input_offset_temp + in_c;
1347ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          T partial_sum = ldg(input + input_offset) * out_bp;
1348ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          T* addr = filter_backprop +
1349ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                    (dm + depth_multiplier *
1350ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + filter_cols * f_r)));
1351ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          CudaAtomicAdd(addr, partial_sum);
1352ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
1353ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
1354ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
1355ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
1356ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = in_r_start + f_r;
1357ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        // Avoid repeated computation.
1358ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
1359ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_d * in_rows * in_cols) +
1360ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_r * in_cols);
1361ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
1362ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = in_c_start + f_c;
1363ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int addr_temp = filter_cols * f_r;
1364ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
1365ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
1366ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int input_offset = input_offset_temp + in_c;
1367ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            T partial_sum = ldg(input + input_offset) * out_bp;
1368ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            T* addr =
1369ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                filter_backprop +
1370ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
1371ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // Potentially many threads can add to the same address so we have
1372ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // to use atomic add here.
1373ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // TODO(jmchen): If atomic add turns out to be slow, we can:
1374ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // 1. allocate multiple buffers for the gradients (one for each
1375ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // example in a batch, for example). This can reduce the
1376ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // contention on the destination; 2. Have each thread compute one
1377ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // gradient for an element in the filters. This should work well
1378ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // when the input depth is big and filter size is not too small.
13795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            CudaAtomicAdd(addr, partial_sum);
13805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          }
13815f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
13825f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
13835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    }
13845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
13855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}
13865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
13873f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
13883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
13893f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const GpuDevice& d, const DepthwiseArgs args, const T* out_backprop,
13903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const T* input, T* filter_backprop, TensorFormat data_format) {
13913f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (data_format != FORMAT_NHWC || args.depth_multiplier != 1 ||
13923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.stride != 1 || args.in_rows > 16 || args.in_cols > 16 ||
13933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.in_rows != args.out_rows || args.in_cols != args.out_cols ||
13943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.pad_rows < 0 || args.pad_rows >= args.filter_rows ||
13953f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.pad_cols < 0 || args.pad_cols >= args.filter_cols) {
13963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return false;
13973f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
13983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
13993f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int lookup_table[] = {0, 3, 1, 3};
14003f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int rows_mask = lookup_table[args.in_cols & 3];
14013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int block_rows = (args.in_rows + 1) / 2 + rows_mask & ~rows_mask;
14023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_cols = args.in_cols + args.filter_cols - 1;
14033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_rows = block_rows * 2 + args.filter_rows - 1;
14045cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int tile_pixels = tile_rows * tile_cols;
14053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_size = args.filter_rows * args.filter_cols * 32;
14063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  dim3 block_dim = dim3(8, args.in_cols, block_rows);
14073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int shared_memory_size =
14085cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      block_dim.x * (tile_pixels + accum_size) * sizeof(T);
14093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
14103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (block_rows > args.in_rows ||
14113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.filter_rows * args.filter_cols > args.in_cols * block_rows ||
14123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      shared_memory_size > d.sharedMemPerBlock()) {
14133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return false;
14143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
14153f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
14163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int num_out_backprop =
14173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
14183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  CudaLaunchConfig config = GetCudaLaunchConfig(
14193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      num_out_backprop, d,
14203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<T, kKnownFilterWidth,
14213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                      kKnownFilterHeight>,
14223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      shared_memory_size, block_dim.x * block_dim.y * block_dim.z);
14233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<T, kKnownFilterWidth,
14243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                  kKnownFilterHeight>
14253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      <<<config.block_count, block_dim, shared_memory_size, d.stream()>>>(
14263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          args, out_backprop, input, filter_backprop);
14273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  return true;
14283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
14293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
14307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
14317828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
14327828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
14337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const DepthwiseArgs args,
14347828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* out_backprop,
14357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* input, T* filter_backprop,
14367828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            TensorFormat data_format) {
14373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  if (TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<T, kKnownFilterWidth,
14383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                                     kKnownFilterHeight>(
14393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          d, args, out_backprop, input, filter_backprop, data_format)) {
14403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    return;
14413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
14427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_out_backprop =
14437828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
14447828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
14453c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
14463c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_out_backprop, d,
14473c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
14483c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
14493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
14507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
14517828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
14527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
14537828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, input, filter_backprop, num_out_backprop);
14547828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
14553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
14563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_out_backprop, d,
14573c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
14583c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
14593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        0, 0);
14607828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
14617828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
14627828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
14637828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, input, filter_backprop, num_out_backprop);
14647828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
14657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
14667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
14677828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
14687828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
14695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
14705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
14715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropFilterGPULaunch {
14727828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args,
1473ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* out_backprop, const T* input, T* filter_backprop,
1474ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  TensorFormat data_format) {
14757828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.filter_rows == 3 && args.filter_cols == 3 &&
14767828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        args.depth_multiplier == 1) {
14777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3, 1>(
14787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, input, filter_backprop, data_format);
1479ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
14807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1, -1>(
14817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, input, filter_backprop, data_format);
1482ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
14835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
14845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen};
14855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
14865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<float>;
14875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<double>;
1488b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}  // namespace tensorflow
1489b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#endif  // GOOGLE_CUDA
1490