depthwise_conv_op_gpu.cu.cc revision 3c02d1100788789b04e04feb93761f0ad898ea77
1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
3b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenLicensed under the Apache License, Version 2.0 (the "License");
4b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenyou may not use this file except in compliance with the License.
5b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenYou may obtain a copy of the License at
6b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
7b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    http://www.apache.org/licenses/LICENSE-2.0
8b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
9b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenUnless required by applicable law or agreed to in writing, software
10b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chendistributed under the License is distributed on an "AS IS" BASIS,
11b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenSee the License for the specific language governing permissions and
13b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenlimitations under the License.
14b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen==============================================================================*/
15b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
16b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#if GOOGLE_CUDA
17b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define EIGEN_USE_GPU
18b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
19b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
20ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/kernels/depthwise_conv_op.h"
21b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/platform/types.h"
22b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/util/cuda_kernel_helper.h"
23ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/util/tensor_format.h"
24b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
25e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#if !defined(_MSC_VER)
26b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define UNROLL _Pragma("unroll")
277828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL _Pragma("nounroll")
28e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#else
29ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#define UNROLL
307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL
31e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#endif
32b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
33b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chennamespace tensorflow {
34b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowerusing Eigen::GpuDevice;
36b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
37ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass
38ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NHWC format.
397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
407828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
413c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2)
423c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
433c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                 const T* filter, T* output, int num_outputs) {
44b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_rows = args.in_rows;
45b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_cols = args.in_cols;
46b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_depth = args.in_depth;
477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
517828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
53b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int stride = args.stride;
54b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int pad_rows = args.pad_rows;
55b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int pad_cols = args.pad_cols;
56b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_rows = args.out_rows;
57b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_cols = args.out_cols;
58b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_depth = args.out_depth;
59b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
60b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
61b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the indexes of this thread in the output.
62b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OD = thread_id % out_depth;
63b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OC = (thread_id / out_depth) % out_cols;
64b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OR = (thread_id / out_depth / out_cols) % out_rows;
65b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int OB = thread_id / out_depth / out_cols / out_rows;
66b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the input depth and the index of depth multiplier.
67b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int in_d = OD / depth_multiplier;
68b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int multiplier = OD % depth_multiplier;
69b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
70ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
71ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
72b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_row_start = OR * stride - pad_rows;
73b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_col_start = OC * stride - pad_cols;
74b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_row_end = input_row_start + filter_rows;
75b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    const int input_col_end = input_col_start + filter_cols;
76b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    T sum = 0;
785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int input_offset_temp = in_rows * OB;
80b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    if (input_row_start >= 0 && input_col_start >= 0 &&
81b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        input_row_end < in_rows && input_col_end < in_cols) {
82b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = input_row_start + f_r;
845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int filter_offset_temp = filter_cols * f_r;
85b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = input_col_start + f_c;
875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int input_offset =
895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int filter_offset =
915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              multiplier +
925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
94b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
95b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
96b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    } else {
97b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
985f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = input_row_start + f_r;
995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int filter_offset_temp = filter_cols * f_r;
100b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
1015f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = input_col_start + f_c;
102b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
1035f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int in_c = input_col_start + f_c;
1045f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
1055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
1065f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp));
1075f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int filter_offset =
108ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                multiplier + depth_multiplier *
109ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                 (in_d + in_depth * (f_c + filter_offset_temp));
1105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
111b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen          }
112b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
113b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
114b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    }
1155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    output[thread_id] = sum;
116b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
117b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}
118ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
119ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass
120ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NCHW format.
1217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
1227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
1233c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2)
1243c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
1253c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                 const T* filter, T* output, int num_outputs) {
126ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
127ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
128ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
1297828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
1307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
1317828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
1327828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
1337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
1347828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
135ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
136ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
137ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
138ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
139ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
140ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
141ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
142ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
143ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the output.
144ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
145ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We want coalesced reads so we make sure that each warp reads
146ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // a contiguous chunk of memory.
147ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
148ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // THIS IS PROBABLY WRONG, we are not doing coalesced reads
149ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // into the input, because of the depth multiplier division...
150ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OC = thread_id % out_cols;
151ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OR = (thread_id / out_cols) % out_rows;
152ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OD = (thread_id / out_cols / out_rows) % out_depth;
153ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int OB = thread_id / out_cols / out_rows / out_depth;
154ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
155ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the input depth and the index of depth multiplier
156ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // based off the output depth index that this thread is
157ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // computing n.
158ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = OD / depth_multiplier;
159ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int multiplier = OD % depth_multiplier;
160ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
161ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Data is stored in the following format (let's assume we
162ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // flatten the height and width into one contiguous dimension
163ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // called "P".
164ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
165ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 ....
166ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 ....
167ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
168ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Each row contains in_depth * in_rows * in_cols values
169ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each sample in the batch.
170ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
171ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We can further flatten it into:
172ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
173ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C1P1 B1C1P2 .....
174ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B1C2P1 B1C2P2 ....
175ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C1P1 B2C1P2 .....
176ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // B2C2P1 B2C2P2 ....
177ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
178ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // where each row is a contiguous array of all of the spatial
179ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // pixels for a given batch and input depth.  The following
180ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // loop unrolls across the filter dimensions for a given thread,
181ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // indexing into the filter value and the corresponding input
182ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // patch.
183ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
184ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We can compute the index into the patch once right here.
185ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_offset_temp = (OB * in_depth + in_d) * (in_rows * in_cols);
186ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
187ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Finally, we can iterate over the spatial dimensions and perform the
188ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // convolution, writing into the output at the end.
189ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    //
190ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // We perform an additional optimization, where we can determine
191ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // whether the patch fits within the image indices statically, and
192ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // avoid boundary checking within the loop.
193ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_row_start = OR * stride - pad_rows;
194ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_col_start = OC * stride - pad_cols;
195ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_row_end = input_row_start + filter_rows;
196ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int input_col_end = input_col_start + filter_cols;
197ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
198ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    T sum = 0;
199ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    if (input_row_start >= 0 && input_col_start >= 0 &&
200ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        input_row_end < in_rows && input_col_end < in_cols) {
201ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      // Loop that doesn't need to check for boundary conditions.
202ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
203ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = input_row_start + f_r;
204ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_offset_temp = filter_cols * f_r;
205ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
206ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = input_col_start + f_c;
207ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
208ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int input_offset =
209ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (input_offset_temp) + (in_r * in_cols) + in_c;
210ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int filter_offset =
211ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              multiplier +
212ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp));
213ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
214ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
215ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
216ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
217ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      // Loop that needs to check for boundary conditions.
218ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
219ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = input_row_start + f_r;
220ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_offset_temp = filter_cols * f_r;
221ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
222ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = input_col_start + f_c;
223ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          // TODO(vrv): the in_r check can be done outside of this loop;
224ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          // benchmark both methods to determine the better decision.
225ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
226ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int in_c = input_col_start + f_c;
227ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
228ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // input_offset_temp indexes into the start of memory
229ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // where the spatial data starts.
230ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int input_offset =
231ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                (input_offset_temp) + (in_r * in_cols) + in_c;
232ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
233ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int filter_offset =
234ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                multiplier + depth_multiplier *
235ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                 (in_d + in_depth * (f_c + filter_offset_temp));
236ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
237ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          }
238ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
239ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
240ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
241ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
242ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    output[thread_id] = sum;
243ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
244ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
245ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
2467828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
2477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
2487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args,
2497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                              const T* input, const T* filter, T* output,
2507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                              TensorFormat data_format) {
2517828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_outputs =
2527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
2533c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  // The compile-time constant version runs faster with a single block.
2543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
2553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                      kKnownDepthMultiplier < 0 ||
2563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                      args.out_rows * args.out_cols <= 256
2573c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                  ? std::numeric_limits<int>::max()
2583c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                  : d.getNumCudaMultiProcessors();
2597828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
2603c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
2613c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_outputs, d,
2623c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
2633c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                     kKnownDepthMultiplier>,
2643c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
2657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
2667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                 kKnownDepthMultiplier>
2673c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        <<<std::min(max_block_count, config.block_count),
2683c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
2693c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                                     output, num_outputs);
2707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
2713c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
2723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_outputs, d,
2733c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
2743c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                     kKnownDepthMultiplier>,
2753c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
2767828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
2777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                 kKnownDepthMultiplier>
2783c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        <<<std::min(max_block_count, config.block_count),
2793c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower           config.thread_per_block, 0, d.stream()>>>(args, input, filter,
2803c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                                     output, num_outputs);
2817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
2827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
2837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
2847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
285b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
286b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
287b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate <typename T>
288b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenstruct DepthwiseConv2dGPULaunch {
2897828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input,
290ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* filter, T* output, TensorFormat data_format) {
2917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.filter_rows == 3 && args.filter_cols == 3 &&
2927828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        args.depth_multiplier == 1) {
2937828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dGPU<T, 3, 3, 1>(d, args, input, filter, output,
2947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           data_format);
295ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
2967828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dGPU<T, -1, -1, -1>(d, args, input, filter, output,
2977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                              data_format);
298ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
299b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
300b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen};
301b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
302b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<float>;
303b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<double>;
304b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
3055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
3067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
3077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3083c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
3093c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
3103c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* out_backprop,
3113c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* filter, T* in_backprop,
3123c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              int num_in_backprop) {
3135f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_rows = args.in_rows;
3145f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_cols = args.in_cols;
3155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
3167828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
3177828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
3187828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
3197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
3207828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
3217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
3225f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
3235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_rows = args.pad_rows;
3245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_cols = args.pad_cols;
3255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_rows = args.out_rows;
3265f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_cols = args.out_cols;
3275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
3285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
3295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
3305f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
3315f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_d = thread_id % in_depth;
3325f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c = (thread_id / in_depth) % in_cols;
3335f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r = (thread_id / in_depth / in_cols) % in_rows;
3345f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int b = thread_id / in_depth / in_cols / in_rows;
3355f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
3365f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    T sum = 0;
3375f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
3385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r_start =
3395f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
3405f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
3415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c_start =
3425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
3435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
3445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
3457828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
3462f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int f_r = in_r + pad_rows - out_r * stride;
3472f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int temp_out_backprop_offset =
3482f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower          out_depth * out_cols * (out_r + out_rows * b);
3492f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int temp_filter_offset = filter_cols * f_r;
3507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
3512f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        const int f_c = in_c + pad_cols - out_c * stride;
3522f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        int filter_offset =
3532f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower            depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset));
3542f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        const int out_backprop_offset =
3552f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower            out_depth * out_c + temp_out_backprop_offset;
356b286574da19e18371e759fe6b676bb07728ef9acA. Unique TensorFlower#pragma unroll 6
3572f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        for (int i = 0; i < depth_multiplier; ++i) {
3582f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower          sum += ldg(out_backprop + out_backprop_offset +
3592f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower                     in_d * depth_multiplier + i) *
3602f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower                 ldg(filter + filter_offset + i);
3615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
3625f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
3635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    }
3645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_backprop_offset =
3655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b));
3665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    in_backprop[in_backprop_offset] = sum;
3675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
3685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}
3695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
3707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
3717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
373ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
374ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* out_backprop,
375ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* filter, T* in_backprop,
376ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              int num_in_backprop) {
377ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
378ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
379ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
3807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
3817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
3827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
3837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
3847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
3857828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
386ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
387ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
388ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
389ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
390ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
391ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
392ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
393ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // TODO(vrv): Consider assigning threads to output and using
394ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // atomics for accumulation, similar to the filter case.
395ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
396ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the input.
397ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c = thread_id % in_cols;
398ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r = (thread_id / in_cols) % in_rows;
399ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = (thread_id / in_cols / in_rows) % in_depth;
400ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int b = thread_id / in_depth / in_cols / in_rows;
401ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
402ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    T sum = 0;
403ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d_start = in_d * depth_multiplier;
404ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d_end = out_d_start + depth_multiplier;
405ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
406ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r_start =
407ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride);
408ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride);
409ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c_start =
410ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride);
411ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride);
412ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
413ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    UNROLL for (int out_d = out_d_start; out_d < out_d_end; ++out_d) {
414ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) {
415ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int f_r = in_r + pad_rows - out_r * stride;
416ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int filter_dm = out_d - out_d_start;
417ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
418ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int temp_filter_offset = filter_cols * f_r;
419ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) {
420ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int f_c = in_c + pad_cols - out_c * stride;
421ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int filter_offset =
422ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              filter_dm + args.depth_multiplier *
423ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + temp_filter_offset));
424ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
425ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int out_backprop_offset =
426ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (b * out_depth * out_rows * out_cols) +
427ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan              (out_d * out_rows * out_cols) + (out_r * out_cols) + (out_c);
428ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
429ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          sum += ldg(out_backprop + out_backprop_offset) *
430ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                 ldg(filter + filter_offset);
431ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
432ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
433ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
434ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_backprop_offset = (b * in_rows * in_cols * in_depth) +
435ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                   (in_d * in_rows * in_cols) +
436ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                   (in_r * in_cols) + (in_c);
437ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    in_backprop[in_backprop_offset] = sum;
438ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
439ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
440ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
4417828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
4427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
4437828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d,
4447828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const DepthwiseArgs args,
4457828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* out_backprop,
4467828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* filter, T* in_backprop,
4477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           TensorFormat data_format) {
4487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_in_backprop =
4497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.in_rows * args.in_cols * args.in_depth;
4507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
4513c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
4523c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_in_backprop, d,
4533c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNHWC<
4543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
4553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
4567828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWC<
4577828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
4587828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
4597828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, filter, in_backprop, num_in_backprop);
4607828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
4613c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
4623c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_in_backprop, d,
4633c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropInputGPUKernelNCHW<
4643c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
4653c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
4667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNCHW<
4677828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
4687828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
4697828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, filter, in_backprop, num_in_backprop);
4707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
4717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
4727828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
4737828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
4747828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
4755f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
4765f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
4775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropInputGPULaunch {
4787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args,
479ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* out_backprop, const T* filter, T* in_backprop,
480ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  TensorFormat data_format) {
4817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.depth_multiplier == 1) {
4827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      if (args.filter_rows == 3 && args.filter_cols == 3) {
4837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3, 1>(
4847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            d, args, out_backprop, filter, in_backprop, data_format);
485ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      } else {
4867828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, 1>(
4877828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            d, args, out_backprop, filter, in_backprop, data_format);
488ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
4892f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower    } else {
4907828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, -1>(
4917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, filter, in_backprop, data_format);
4922f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower    }
4935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
4945f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen};
4955f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
4965f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<float>;
4975f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<double>;
4985f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
4995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
5007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
5017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
5023c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
5033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
5043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* out_backprop,
5053c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* input,
5063c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               T* filter_backprop,
5073c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               int num_out_backprop) {
5085f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_rows = args.in_rows;
5095f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_cols = args.in_cols;
5105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
5117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
5127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
5137828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
5147828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
5157828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
5167828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
5175f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
5185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_rows = args.pad_rows;
5195f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int pad_cols = args.pad_cols;
5205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_rows = args.out_rows;
5215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_cols = args.out_cols;
5225f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
5235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
5255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
5265f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_d = thread_id % out_depth;
5275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_c = (thread_id / out_depth) % out_cols;
5285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_r = (thread_id / out_depth / out_cols) % out_rows;
5295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int b = thread_id / out_depth / out_cols / out_rows;
5305f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the input depth and the index of depth multiplier.
5315f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_d = out_d / depth_multiplier;
5325f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int dm = out_d % depth_multiplier;
5335f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
534ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
535ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
5365f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r_start = out_r * stride - pad_rows;
5375f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c_start = out_c * stride - pad_cols;
5385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_r_end = in_r_start + filter_rows;
5395f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_c_end = in_c_start + filter_cols;
5405f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_backprop_offset =
5425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b));
5435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const T out_bp = ldg(out_backprop + out_backprop_offset);
5445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
5455f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        in_c_end < in_cols) {
5465f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
5475f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = in_r_start + f_r;
5485f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
5495f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int input_offset_temp = in_cols * (in_r + in_rows * b);
5505f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
5515f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = in_c_start + f_c;
5525f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5535f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int input_offset = in_d + in_depth * (in_c + input_offset_temp);
5545f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          T partial_sum = ldg(input + input_offset) * out_bp;
5555f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          T* addr = filter_backprop +
556ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                    (dm + depth_multiplier *
557ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + filter_cols * f_r)));
5585f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          CudaAtomicAdd(addr, partial_sum);
5595f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
5605f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
5615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    } else {
5625f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
5635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int in_r = in_r_start + f_r;
5645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
5655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        const int input_offset_temp = in_cols * (in_r + in_rows * b);
5665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
5675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int in_c = in_c_start + f_c;
5685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int addr_temp = filter_cols * f_r;
5695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
5715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
5725f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                in_d + in_depth * (in_c + input_offset_temp);
5735f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T partial_sum = ldg(input + input_offset) * out_bp;
5745f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T* addr =
5755f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                filter_backprop +
5765f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
5775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // Potentially many threads can add to the same address so we have
5785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // to use atomic add here.
5795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // TODO(jmchen): If atomic add turns out to be slow, we can:
5805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // 1. allocate multiple buffers for the gradients (one for each
581ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // example in a batch, for example). This can reduce the
582ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // contention on the destination; 2. Have each thread compute one
583ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // gradient for an element in the filters. This should work well
584ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // when the input depth is big and filter size is not too small.
585ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            CudaAtomicAdd(addr, partial_sum);
586ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          }
587ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
588ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
589ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
590ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
591ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
592ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
593ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
5947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
5957828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
5963c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
5973c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
5983c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* out_backprop,
5993c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* input,
6003c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               T* filter_backprop,
6013c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               int num_out_backprop) {
602ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_rows = args.in_rows;
603ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_cols = args.in_cols;
604ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
6057828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_rows =
6067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
6077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int filter_cols =
6087828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
6097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
6107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
611ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
612ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_rows = args.pad_rows;
613ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int pad_cols = args.pad_cols;
614ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_rows = args.out_rows;
615ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_cols = args.out_cols;
616ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
617ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
618ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
619ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the output.
620ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_c = thread_id % out_cols;
621ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_r = (thread_id / out_cols) % out_rows;
622ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_d = (thread_id / out_cols / out_rows) % out_depth;
623ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
624ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int b = thread_id / out_depth / out_cols / out_rows;
625ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the input depth and the index of depth multiplier.
626ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_d = out_d / depth_multiplier;
627ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int dm = out_d % depth_multiplier;
628ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
629ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
630ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
631ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r_start = out_r * stride - pad_rows;
632ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c_start = out_c * stride - pad_cols;
633ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_r_end = in_r_start + filter_rows;
634ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int in_c_end = in_c_start + filter_cols;
635ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
636ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const int out_backprop_offset = (b * out_depth * out_rows * out_cols) +
637ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                    (out_d * out_rows * out_cols) +
638ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                    (out_r * out_cols) + (out_c);
639ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
640ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    const T out_bp = ldg(out_backprop + out_backprop_offset);
641ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows &&
642ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        in_c_end < in_cols) {
643ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
644ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = in_r_start + f_r;
645ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        // Avoid repeated computation.
646ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
647ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_d * in_rows * in_cols) +
648ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_r * in_cols);
649ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
650ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
651ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = in_c_start + f_c;
652ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int input_offset = input_offset_temp + in_c;
653ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          T partial_sum = ldg(input + input_offset) * out_bp;
654ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          T* addr = filter_backprop +
655ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                    (dm + depth_multiplier *
656ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                              (in_d + in_depth * (f_c + filter_cols * f_r)));
657ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          CudaAtomicAdd(addr, partial_sum);
658ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
659ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
660ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
661ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) {
662ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int in_r = in_r_start + f_r;
663ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        // Avoid repeated computation.
664ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        const int input_offset_temp = (b * in_depth * in_rows * in_cols) +
665ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_d * in_rows * in_cols) +
666ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                      (in_r * in_cols);
667ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) {
668ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int in_c = in_c_start + f_c;
669ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int addr_temp = filter_cols * f_r;
670ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
671ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) {
672ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            const int input_offset = input_offset_temp + in_c;
673ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            T partial_sum = ldg(input + input_offset) * out_bp;
674ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            T* addr =
675ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                filter_backprop +
676ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp)));
677ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // Potentially many threads can add to the same address so we have
678ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // to use atomic add here.
679ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // TODO(jmchen): If atomic add turns out to be slow, we can:
680ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // 1. allocate multiple buffers for the gradients (one for each
681ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // example in a batch, for example). This can reduce the
682ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // contention on the destination; 2. Have each thread compute one
683ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // gradient for an element in the filters. This should work well
684ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // when the input depth is big and filter size is not too small.
6855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            CudaAtomicAdd(addr, partial_sum);
6865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          }
6875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
6885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
6895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    }
6905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
6915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}
6925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
6937828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
6947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
6957828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d,
6967828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const DepthwiseArgs args,
6977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* out_backprop,
6987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* input, T* filter_backprop,
6997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            TensorFormat data_format) {
7007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_out_backprop =
7017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
7027828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  if (data_format == FORMAT_NHWC) {
7033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
7043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_out_backprop, d,
7053c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropFilterGPUKernelNHWC<
7063c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
7073c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
7087828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNHWC<
7097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
7107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
7117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, input, filter_backprop, num_out_backprop);
7127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else if (data_format == FORMAT_NCHW) {
7133c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    CudaLaunchConfig config = GetCudaLaunchConfig(
7143c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        num_out_backprop, d,
7153c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        DepthwiseConv2dBackpropFilterGPUKernelNCHW<
7163c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower            T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>,
7173c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower        0);
7187828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNCHW<
7197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>
7207828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
7217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower            args, out_backprop, input, filter_backprop, num_out_backprop);
7227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  } else {
7237828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    assert(false);
7247828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
7257828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
7267828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
7275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
7285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
7295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropFilterGPULaunch {
7307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  static void Run(const GpuDevice& d, const DepthwiseArgs args,
731ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  const T* out_backprop, const T* input, T* filter_backprop,
732ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                  TensorFormat data_format) {
7337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower    if (args.filter_rows == 3 && args.filter_cols == 3 &&
7347828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower        args.depth_multiplier == 1) {
7357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3, 1>(
7367828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, input, filter_backprop, data_format);
737ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    } else {
7387828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1, -1>(
7397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          d, args, out_backprop, input, filter_backprop, data_format);
740ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
7415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
7425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen};
7435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
7445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<float>;
7455f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<double>;
746b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}  // namespace tensorflow
747b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#endif  // GOOGLE_CUDA
748