core/kernels/depthwise_conv_op_gpu.cu.cc

c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenLicensed under the Apache License, Version 2.0 (the "License");
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenyou may not use this file except in compliance with the License.
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenYou may obtain a copy of the License at
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    http://www.apache.org/licenses/LICENSE-2.0
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenUnless required by applicable law or agreed to in writing, software
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chendistributed under the License is distributed on an "AS IS" BASIS,
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenSee the License for the specific language governing permissions and
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenlimitations under the License.
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen==============================================================================*/
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#if GOOGLE_CUDA
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define EIGEN_USE_GPU
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
0f65c8f572201f8838189f3e3c3e455759112c14A. Unique TensorFlower#include "external/cub_archive/cub/util_ptx.cuh"
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower#include "tensorflow/core/framework/op_kernel.h"
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/kernels/depthwise_conv_op.h"
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/platform/types.h"
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/util/cuda_kernel_helper.h"
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/util/tensor_format.h"
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
// Loop-unrolling hints placed directly in front of `for` statements.
// Every compiler except MSVC (unless clang is the front end) gets real
// unroll / nounroll pragmas; otherwise both macros expand to nothing.
#if !defined(_MSC_VER) || defined(__clang__)
#define UNROLL _Pragma("unroll")
#define NOUNROLL _Pragma("nounroll")
#else
#define UNROLL
#define NOUNROLL
#endif
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chennamespace tensorflow {
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowerusing Eigen::GpuDevice;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// Returns whether depthwise convolution forward or backward input pass can be
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// performed using the faster ('Small') variant of the kernel.
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowerEIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall(
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower    const DepthwiseArgs& args) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower         args.in_cols <= 32 && args.in_rows == args.out_rows &&
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.in_cols == args.out_cols && args.pad_rows >= 0 &&
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.pad_cols < args.filter_cols &&
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower         args.filter_rows * args.filter_cols <=
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower             (args.in_rows + 1) / 2 * args.in_cols;
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower}
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// Returns whether depthwise convolution backward filter pass can be performed
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// using the faster ('Small') variant of the kernel.
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlowerEIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const DepthwiseArgs& args, const int block_height) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 &&
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower         args.in_cols <= 32 && args.in_rows == args.out_rows &&
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower         args.in_cols == args.out_cols && args.pad_rows >= 0 &&
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower         args.pad_rows < args.filter_rows && args.pad_cols >= 0 &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower         args.pad_cols < args.filter_cols && block_height <= args.in_rows &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower         args.filter_rows * args.filter_cols <= args.in_cols * block_height;
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower}
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower
// Template argument of the DepthwiseConv2dGPUKernels that selects which
// convolution they compute.
enum DepthwiseConv2dDirection {
  DIRECTION_FORWARD,  // forward convolution
  DIRECTION_BACKWARD  // backprop-input convolution
};
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NHWC format.
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2)
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                 const T* filter, T* output, int num_outputs) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = args.in_cols;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int stride = args.stride;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_height = args.out_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_width = args.out_cols;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  const int out_depth = args.out_depth;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the indexes of this thread in the output.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_channel = thread_id % out_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col = (thread_id / out_depth) % out_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row = (thread_id / out_depth / out_width) % out_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int batch = thread_id / out_depth / out_width / out_height;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    // Compute the input depth and the index of depth multiplier.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_channel = out_channel / depth_multiplier;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int multiplier = out_channel % depth_multiplier;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int input_row_start = out_row * stride - pad_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int input_col_start = out_col * stride - pad_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int input_row_end = input_row_start + filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int input_col_end = input_col_start + filter_width;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng    T sum = static_cast<T>(0);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int input_offset_temp = in_height * batch;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    if (input_row_start >= 0 && input_col_start >= 0 &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        input_row_end < in_height && input_col_end < in_width) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int filter_row = 0; filter_row < filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++filter_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int in_row = input_row_start + filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int filter_offset_temp = filter_width * filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int filter_col = 0; filter_col < filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    ++filter_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int in_col = input_col_start + filter_col;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int input_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              in_channel +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              in_depth * (in_col + in_width * (in_row + input_offset_temp));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          const int filter_offset =
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen              multiplier +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  (in_channel + in_depth * (filter_col + filter_offset_temp));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    } else {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int filter_row = 0; filter_row < filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++filter_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int in_row = input_row_start + filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int filter_offset_temp = filter_width * filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int filter_col = 0; filter_col < filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    ++filter_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int in_col = input_col_start + filter_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              in_col < in_width) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            const int in_col = input_col_start + filter_col;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                in_channel +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                in_depth * (in_col + in_width * (in_row + input_offset_temp));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int filter_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                multiplier +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    (in_channel + in_depth * (filter_col + filter_offset_temp));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen          }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen        }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen      }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen    }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    output[thread_id] = sum;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
827874c3071b36960f5ad614edcfcdd193692718A. Unique TensorFlower// CUDA kernel to compute the depthwise convolution forward pass in NHWC format,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// tailored for small images up to 32x32. Stride and depth multiplier must be 1.
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Padding must be 'SAME', which allows to reuse the index computation. Only
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration,
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// one each in the lower and upper half of a tile.
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// Backprop input direction is the same as forward direction with the filter
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// rotated by 180°.
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          bool kKnownEvenHeight>
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.x depths.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int num_batches = args.batch;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = args.in_cols;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  assert(blockDim.x == kBlockDepth);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  assert(blockDim.y == args.in_cols);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int block_height = blockDim.z;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // These values are the same for all threads and could
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // be precomputed on the CPU.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int block_size = block_height * in_width * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_row_size = in_width * in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_size = in_height * in_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_increment = (in_width - 1) * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_pixels = filter_height * filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_width = in_width + filter_width - 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int even_height = kKnownEvenHeight || (1 & ~in_height);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_height = in_height + filter_height - even_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_row_size = tile_width * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_size = tile_height * tile_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_offset = block_height * tile_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_offset = pad_height * tile_width + pad_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_blocks = batch_blocks * num_batches;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      kKnownEvenHeight ? in_size / 2 : block_height * in_row_size;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_depth = threadIdx.x;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_col = threadIdx.y;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_row = threadIdx.z;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in block.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int thread_pix = thread_row * in_width + thread_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int thread_idx = thread_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Initialize tile, in particular the padding.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size; i += block_size) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    shared_data[i] = T(0);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  __syncthreads();
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in tensors.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_idx = thread_pix * in_depth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in (padded) shared memory.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int data_pix = thread_row * tile_width + thread_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int data_idx = data_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // Position in shared memory, offset by pad_height / pad_width.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_pix = data_pix + pad_offset;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_idx = tile_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int max_channel = in_depth - thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int filter_write_offset =
5cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower      thread_pix < filter_pixels ? tile_size + thread_idx : 0;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  const int filter_read_offset =
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      tile_size + thread_depth +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const bool skip_second =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      !kKnownEvenHeight && thread_row + (in_height & 1) == block_height;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int batch = b / batch_blocks;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int block = b - batch * batch_blocks;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int start_channel = block * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int filter_offset = tensor_idx + start_channel;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int inout_offset = batch * in_size + filter_offset;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const bool channel_in_range = start_channel < max_channel;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    if (channel_in_range) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (filter_write_offset != 0) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_data[filter_write_offset] = ldg(filter_offset + filter);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    if (channel_in_range) {
b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng      T sum1 = static_cast<T>(0);
b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng      T sum2 = static_cast<T>(0);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      int shared_offset = data_idx;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* filter_ptr = filter_read_offset + shared_data;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_height; ++r) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_width; ++c) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower          if (kDirection == DIRECTION_BACKWARD) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            filter_ptr -= kBlockDepth;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower          }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T filter_value = *filter_ptr;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum1 += filter_value * tile_ptr[0];
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          sum2 += filter_value * tile_ptr[tile_offset];
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          shared_offset += kBlockDepth;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower          if (kDirection == DIRECTION_FORWARD) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            filter_ptr += kBlockDepth;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower          }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_offset += in_increment;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const out_ptr = inout_offset + output;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      out_ptr[0] = sum1;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        out_ptr[tensor_offset] = sum2;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
// A CUDA kernel to compute the depthwise convolution forward pass in NCHW
// format.
//
// Every iteration of CUDA_1D_KERNEL_LOOP produces exactly one output element:
// `thread_id` is the flat output index and is decomposed (innermost to
// outermost) into output column, row, channel and batch.
//
// Template parameters kKnownFilterWidth / kKnownFilterHeight /
// kKnownDepthMultiplier: a negative value means "read the value from `args` at
// run time"; a non-negative value is used directly as a compile-time constant.
//
// Args:
//   args:        convolution geometry (sizes, stride, padding).
//   input:       input tensor, indexed as [batch][in_channel][row][col].
//   filter:      filter tensor, indexed as
//                [filter_row][filter_col][in_channel][multiplier].
//   output:      output tensor, written at the flat index `thread_id`.
//   num_outputs: number of output elements to produce.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
__global__ void __launch_bounds__(1024, 2)
    DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input,
                                 const T* filter, T* output, int num_outputs) {
  const int in_height = args.in_rows;
  const int in_width = args.in_cols;
  const int in_depth = args.in_depth;
  const int filter_height =
      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
  const int filter_width =
      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
  const int depth_multiplier =
      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
  const int stride = args.stride;
  const int pad_height = args.pad_rows;
  const int pad_width = args.pad_cols;
  const int out_height = args.out_rows;
  const int out_width = args.out_cols;
  const int out_depth = args.out_depth;

  CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) {
    // Compute the indexes of this thread in the output.
    //
    // We want coalesced reads so we make sure that each warp reads
    // a contiguous chunk of memory.
    //
    // NOTE: the reads from the input are probably not fully coalesced,
    // because of the depth multiplier division below.
    const int out_col = thread_id % out_width;
    const int out_row = (thread_id / out_width) % out_height;
    const int out_channel = (thread_id / out_width / out_height) % out_depth;
    const int batch = thread_id / out_width / out_height / out_depth;

    // Compute the input depth and the index of depth multiplier
    // based off the output depth index that this thread is
    // computing.
    const int in_channel = out_channel / depth_multiplier;
    const int multiplier = out_channel % depth_multiplier;

    // Data is stored in the following format (let's assume we
    // flatten the height and width into one contiguous dimension
    // called "P").
    //
    // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 ....
    // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 ....
    //
    // Each row contains in_depth * in_height * in_width values
    // for each sample in the batch.
    //
    // We can further flatten it into:
    //
    // B1C1P1 B1C1P2 .....
    // B1C2P1 B1C2P2 ....
    // B2C1P1 B2C1P2 .....
    // B2C2P1 B2C2P2 ....
    //
    // where each row is a contiguous array of all of the spatial
    // pixels for a given batch and input depth.  The following
    // loop unrolls across the filter dimensions for a given thread,
    // indexing into the filter value and the corresponding input
    // patch.
    //
    // We can compute the index into the patch once right here.
    const int input_offset_temp =
        (batch * in_depth + in_channel) * (in_height * in_width);

    // Finally, we can iterate over the spatial dimensions and perform the
    // convolution, writing into the output at the end.
    //
    // We perform an additional optimization, where we can determine
    // whether the patch fits within the image indices statically, and
    // avoid boundary checking within the loop.
    //
    // input_row_end / input_col_end are exclusive bounds: the last row and
    // column touched are input_row_end - 1 and input_col_end - 1.
    const int input_row_start = out_row * stride - pad_height;
    const int input_col_start = out_col * stride - pad_width;
    const int input_row_end = input_row_start + filter_height;
    const int input_col_end = input_col_start + filter_width;

    T sum = static_cast<T>(0);
    // Because the end bounds are exclusive, a patch that finishes exactly on
    // the last row/column (end == size) is still fully inside the image, so
    // the comparison is `<=`.
    if (input_row_start >= 0 && input_col_start >= 0 &&
        input_row_end <= in_height && input_col_end <= in_width) {
      // Loop that doesn't need to check for boundary conditions.
      UNROLL for (int filter_row = 0; filter_row < filter_height;
                  ++filter_row) {
        const int in_row = input_row_start + filter_row;
        const int filter_offset_temp = filter_width * filter_row;
        UNROLL for (int filter_col = 0; filter_col < filter_width;
                    ++filter_col) {
          const int in_col = input_col_start + filter_col;

          const int input_offset =
              (input_offset_temp) + (in_row * in_width) + in_col;
          const int filter_offset =
              multiplier +
              depth_multiplier *
                  (in_channel + in_depth * (filter_col + filter_offset_temp));
          sum += ldg(input + input_offset) * ldg(filter + filter_offset);
        }
      }
    } else {
      // Loop that needs to check for boundary conditions. Taps that fall
      // outside the image contribute nothing to the sum.
      UNROLL for (int filter_row = 0; filter_row < filter_height;
                  ++filter_row) {
        const int in_row = input_row_start + filter_row;
        const int filter_offset_temp = filter_width * filter_row;
        UNROLL for (int filter_col = 0; filter_col < filter_width;
                    ++filter_col) {
          const int in_col = input_col_start + filter_col;
          // TODO(vrv): the in_row check can be done outside of this loop;
          // benchmark both methods to determine the better decision.
          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
              in_col < in_width) {
            // input_offset_temp indexes into the start of memory
            // where the spatial data starts.
            const int input_offset =
                (input_offset_temp) + (in_row * in_width) + in_col;

            const int filter_offset =
                multiplier +
                depth_multiplier *
                    (in_channel + in_depth * (filter_col + filter_offset_temp));
            sum += ldg(input + input_offset) * ldg(filter + filter_offset);
          }
        }
      }
    }

    output[thread_id] = sum;
  }
}
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
// CUDA kernel computing the depthwise convolution forward pass in NCHW format
// for small images (up to 32x32). Stride and depth multiplier must be 1 and
// padding must be 'SAME', which lets input and output share one index
// computation. Launch it only when CanLaunchDepthwiseConv2dGPUSmall(args)
// returns true.
//
// Launch layout: blockDim.x == args.in_cols, blockDim.y == number of image
// rows handled per half, blockDim.z == kBlockDepth. Dynamic shared memory must
// hold kBlockDepth padded tiles followed by kBlockDepth filter values per
// filter pixel.
//
// Input and filter tiles are staged in shared memory before convolving. Each
// thread produces two output elements per iteration: one in the upper half of
// the tile and one in the lower half.
// The backprop-input direction is the forward direction with the filter
// rotated by 180 degrees.
template <typename T, DepthwiseConv2dDirection kDirection,
          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
          bool kKnownEvenHeight>
__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
    const DepthwiseArgs args, const T* input, const T* filter, T* output) {
  assert(CanLaunchDepthwiseConv2dGPUSmall(args));
  // Scratch space for the padded tiles (block plus halo) and the filter
  // values of blockDim.z depths.
  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
  T* const shared_data = reinterpret_cast<T*>(shared_memory);

  // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16.
  assert(blockDim.x == args.in_cols);
  assert(blockDim.z == kBlockDepth);

  const int batch_count = args.batch;
  const int image_rows = args.in_rows;
  const int image_cols = args.in_cols;
  const int channels = args.in_depth;
  const int kernel_rows =
      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
  const int kernel_cols =
      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
  const int rows_per_block = blockDim.y;

  // Block-uniform geometry; identical for every thread, so it could equally
  // be computed on the host.
  const int pixels_per_block = image_cols * rows_per_block;
  const int threads_per_block = pixels_per_block * kBlockDepth;
  const int image_pixels = image_cols * image_rows;
  const int kernel_pixels = kernel_rows * kernel_cols;
  const int tile_cols = image_cols + kernel_cols - 1;
  // 1 when the image height is even (statically or at run time), else 0.
  const int height_is_even =
      (kKnownEvenHeight || (image_rows & 1) == 0) ? 1 : 0;
  const int tile_rows = image_rows + kernel_rows - height_is_even;
  const int pixels_per_tile = tile_cols * tile_rows;
  const int tile_elems = pixels_per_tile * kBlockDepth;
  // Distance in shared memory between a thread's first and second element.
  const int second_half_offset = rows_per_block * tile_cols;
  const int halo_offset = args.pad_rows * tile_cols + args.pad_cols;
  const int total_channels = channels * batch_count;
  const int channel_blocks = (total_channels + kBlockDepth - 1) / kBlockDepth;

  const int col = threadIdx.x;
  const int row = threadIdx.y;
  const int depth_lane = threadIdx.z;

  // Linear position of this thread inside the block.
  const int pixel_in_block = row * image_cols + col;
  const int linear_thread = depth_lane * pixels_per_block + pixel_in_block;

  // Clear the tiles once so that the padding region reads as zero.
  int clear_idx = linear_thread;
  while (clear_idx < tile_elems) {
    shared_data[clear_idx] = T(0);
    clear_idx += threads_per_block;
  }
  __syncthreads();

  // Offset of this thread's first element within a channel block of the
  // input/output tensors.
  const int tensor_base = depth_lane * image_pixels + pixel_in_block;

  // Top-left corner of this thread's filter window in the padded tile.
  const int window_base = depth_lane * pixels_per_tile + row * tile_cols + col;

  // Where this thread's own input pixel lives in the padded tile, i.e. the
  // window corner shifted by pad_rows / pad_cols.
  const int center_idx = window_base + halo_offset;

  // Filter is always in HWCK format, irrespective of the input/output format.
  const int tap = linear_thread / kBlockDepth;
  const int tap_channel = linear_thread % kBlockDepth;
  const int filter_row_base = tap * channels;

  const int channel_limit = total_channels - depth_lane;
  // Zero doubles as "this thread does not load a filter value".
  const int filter_store_idx =
      tap < kernel_pixels ? tile_elems + linear_thread : 0;
  // Backward direction starts one past the last filter pixel and walks down.
  const int filter_load_idx =
      tile_elems + depth_lane +
      (kDirection == DIRECTION_FORWARD ? 0 : kernel_pixels * kBlockDepth);
  // For odd heights the last thread row has no second element to process.
  const bool omit_second =
      !kKnownEvenHeight && row + (image_rows & 1) == rows_per_block;

  for (int b = blockIdx.x; b < channel_blocks; b += gridDim.x) {
    const int first_channel = b * kBlockDepth;

    const int tensor_offset = first_channel * image_pixels + tensor_base;
    const bool active = first_channel < channel_limit;

    // Stage this thread's one or two input pixels into the tile.
    if (active) {
      const T* const src = input + tensor_offset;
      T* const dst = shared_data + center_idx;
      dst[0] = ldg(src);
      if (!omit_second) {
        dst[second_half_offset] = ldg(src + pixels_per_block);
      }
    }

    // Stage the filter values for the channels of this block.
    if (filter_store_idx != 0) {
      const int filter_src =
          filter_row_base + (first_channel + tap_channel) % channels;
      shared_data[filter_store_idx] = ldg(filter + filter_src);
    }

    // Note: the condition to reach this is uniform across the entire block.
    __syncthreads();

    if (active) {
      T acc_first = static_cast<T>(0);
      T acc_second = static_cast<T>(0);
      const T* weight_ptr = shared_data + filter_load_idx;
      UNROLL for (int r = 0; r < kernel_rows; ++r) {
        const T* const window_row = shared_data + window_base + r * tile_cols;
        UNROLL for (int c = 0; c < kernel_cols; ++c) {
          if (kDirection == DIRECTION_BACKWARD) {
            weight_ptr -= kBlockDepth;
          }
          const T weight = *weight_ptr;
          acc_first += weight * window_row[c];
          acc_second += weight * window_row[c + second_half_offset];
          if (kDirection == DIRECTION_FORWARD) {
            weight_ptr += kBlockDepth;
          }
        }
      }
      T* const dst = output + tensor_offset;
      dst[0] = acc_first;
      if (!omit_second) {
        dst[pixels_per_block] = acc_second;
      }
    }

    // Note: the condition to reach this is uniform across the entire block.
    __syncthreads();
  }
}
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
// Launches the small-image depthwise convolution kernel that matches
// `data_format` on `device`'s stream.
//
// The block shape covers the full image width, half the image height (rounded
// up) and kBlockDepth channels; the axis order of the block depends on the
// data format. Dynamic shared memory is sized for kBlockDepth padded tiles
// plus kBlockDepth copies of the filter pixels.
//
// Unsupported data formats are logged as an error and nothing is launched.
template <typename T, DepthwiseConv2dDirection kDirection,
          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth,
          bool kKnownEvenHeight>
void LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
                                   const DepthwiseArgs& args, const T* input,
                                   const T* filter, T* output,
                                   TensorFormat data_format) {
  // Each thread handles two rows, so a block spans half the (rounded up)
  // image height.
  const int block_height = (args.in_rows + 1) / 2;
  dim3 block_dim;
  int block_count = 0;
  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*) = nullptr;
  switch (data_format) {
    case FORMAT_NHWC:
      block_dim = dim3(kBlockDepth, args.in_cols, block_height);
      block_count =
          args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
      kernel =
          DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth,
                                            kKnownFilterHeight, kBlockDepth,
                                            kKnownEvenHeight>;
      break;
    case FORMAT_NCHW:
      block_dim = dim3(args.in_cols, block_height, kBlockDepth);
      block_count =
          DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
      kernel =
          DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth,
                                            kKnownFilterHeight, kBlockDepth,
                                            kKnownEvenHeight>;
      break;
    case FORMAT_NCHW_VECT_C:
      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
      return;
    default:
      // Any other format would otherwise fall through with `kernel` unset and
      // be launched through an invalid function pointer.
      LOG(ERROR) << "Data format " << static_cast<int>(data_format)
                 << " is not supported";
      return;
  }
  // Shared memory holds, per depth slice, one padded tile (two half-blocks of
  // rows plus the filter halo) and one value per filter pixel.
  const int tile_width = args.in_cols + args.filter_cols - 1;
  const int tile_height = block_height * 2 + args.filter_rows - 1;
  const int tile_pixels = tile_height * tile_width;
  const int filter_pixels = args.filter_rows * args.filter_cols;
  const int shared_memory_size =
      kBlockDepth * (tile_pixels + filter_pixels) * sizeof(T);
  const int num_outputs = args.out_rows * args.out_cols * block_count;
  CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
      num_outputs, device, kernel, shared_memory_size,
      block_dim.x * block_dim.y * block_dim.z);
  kernel<<<config.block_count, block_dim, shared_memory_size,
           device.stream()>>>(args, input, filter, output);
}
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                   const DepthwiseArgs& args, const T* input,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                   const T* filter, T* output,
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower                                   TensorFormat data_format) {
aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower  if (args.in_rows & 1) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  kKnownFilterHeight, kBlockDepth, false>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
5d5d9f707f0df1083d87c415f95c22ab3999bfdeA. Unique TensorFlower  } else {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  kKnownFilterHeight, kBlockDepth, true>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower}
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection,
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower          int kKnownFilterWidth, int kKnownFilterHeight>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                   const DepthwiseArgs& args, const T* input,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                   const T* filter, T* output,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                   TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // Maximize (power of two) kBlockDepth while keeping a block within 1024
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  // threads (2 pixels per thread).
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  const int block_pixels = (args.in_rows + 1) / 2 * args.in_cols;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  if (block_pixels > 256) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  kKnownFilterHeight, 2>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else if (block_pixels > 128) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  kKnownFilterHeight, 4>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower    LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  kKnownFilterHeight, 8>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& device,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                              const DepthwiseArgs& args, const T* input,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                              const T* filter, T* output,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                              TensorFormat data_format) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  switch (data_format) {
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NHWC:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel =
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight,
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower                                       kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel =
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight,
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower                                       kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW_VECT_C:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      return;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  }
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int num_outputs =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  CudaLaunchConfig config =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      GetCudaLaunchConfig(num_outputs, device, kernel, 0, 0);
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  // The compile-time constant version runs faster with a single block.
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower  const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 ||
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower                                      kKnownDepthMultiplier < 0
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                  ? std::numeric_limits<int>::max()
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                  : device.getNumCudaMultiProcessors();
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  kernel<<<std::min(max_block_count, config.block_count),
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower           config.thread_per_block, 0, device.stream()>>>(args, input, filter,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                                          output, num_outputs);
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& device,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                              const DepthwiseArgs& args, const T* input,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                              const T* filter, T* output,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                              TensorFormat data_format) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  if (args.depth_multiplier == 1) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_FORWARD, kKnownFilterWidth,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                    kKnownFilterHeight>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, 1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, -1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, input, filter, output, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower}
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate <typename T>
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                                     const DepthwiseArgs& args,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                                                     const T* input,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                                                     const T* filter, T* output,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                                                     TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  if (args.filter_rows == 3 && args.filter_cols == 3) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    LaunchDepthwiseConv2dGPU<T, 3, 3>(device, args, input, filter, output,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                                      data_format);
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  } else {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    LaunchDepthwiseConv2dGPU<T, -1, -1>(device, args, input, filter, output,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                        data_format);
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen  }
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  auto stream = ctx->op_device_context()->stream();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  OP_REQUIRES(ctx, stream->ok(),
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower              errors::Internal(
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                  "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed"));
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower}
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, float>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, double>;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input.
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* out_backprop,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              const T* filter, T* in_backprop,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                              int num_in_backprop) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = args.in_cols;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_height = args.out_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_width = args.out_cols;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_channel = thread_id % in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_col = (thread_id / in_depth) % in_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_row = (thread_id / in_depth / in_width) % in_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int batch = thread_id / in_depth / in_width / in_height;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng    T sum = static_cast<T>(0);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row_start =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row_end =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_min(out_height - 1, (in_row + pad_height) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col_start =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_max(0, (in_col - filter_width + pad_width + stride) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col_end =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_min(out_width - 1, (in_col + pad_width) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    NOUNROLL for (int out_row = out_row_start; out_row <= out_row_end;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++out_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      const int filter_row = in_row + pad_height - out_row * stride;
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower      const int temp_out_backprop_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          out_depth * out_width * (out_row + out_height * batch);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      const int temp_filter_offset = filter_width * filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      NOUNROLL for (int out_col = out_col_start; out_col <= out_col_end;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    ++out_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int filter_col = in_col + pad_width - out_col * stride;
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        int filter_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            (in_channel + in_depth * (filter_col + temp_filter_offset));
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        const int out_backprop_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            out_depth * out_col + temp_out_backprop_offset;
b286574da19e18371e759fe6b676bb07728ef9acA. Unique TensorFlower#pragma unroll 6
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower        for (int i = 0; i < depth_multiplier; ++i) {
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower          sum += ldg(out_backprop + out_backprop_offset +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                     in_channel * depth_multiplier + i) *
2f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower                 ldg(filter + filter_offset + i);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int in_backprop_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        in_channel +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        in_depth * (in_col + in_width * (in_row + in_height * batch));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    in_backprop[in_backprop_offset] = sum;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args,
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* out_backprop,
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              const T* filter, T* in_backprop,
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                                              int num_in_backprop) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = args.in_cols;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int stride = args.stride;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_height = args.out_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_width = args.out_cols;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  const int out_depth = args.out_depth;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // TODO(vrv): Consider assigning threads to output and using
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  // atomics for accumulation, similar to the filter case.
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) {
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Compute the indexes of this thread in the input.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_col = thread_id % in_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_row = (thread_id / in_width) % in_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_channel = (thread_id / in_width / in_height) % in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int batch = thread_id / in_depth / in_width / in_height;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng    T sum = static_cast<T>(0);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_channel_start = in_channel * depth_multiplier;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_channel_end = out_channel_start + depth_multiplier;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row_start =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row_end =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_min(out_height - 1, (in_row + pad_height) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col_start =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_max(0, (in_col - filter_width + pad_width + stride) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col_end =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        tf_min(out_width - 1, (in_col + pad_width) / stride);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    UNROLL for (int out_channel = out_channel_start;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                out_channel < out_channel_end; ++out_channel) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int out_row = out_row_start; out_row <= out_row_end;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++out_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int filter_row = in_row + pad_height - out_row * stride;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int filter_dm = out_channel - out_channel_start;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int temp_filter_offset = filter_width * filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        for (int out_col = out_col_start; out_col <= out_col_end; ++out_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int filter_col = in_col + pad_width - out_col * stride;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int filter_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              filter_dm +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              args.depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  (in_channel + in_depth * (filter_col + temp_filter_offset));
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          const int out_backprop_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              (batch * out_depth * out_height * out_width) +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              (out_channel * out_height * out_width) + (out_row * out_width) +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              (out_col);
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          sum += ldg(out_backprop + out_backprop_offset) *
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan                 ldg(filter + filter_offset);
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_backprop_offset = (batch * in_height * in_width * in_depth) +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                   (in_channel * in_height * in_width) +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                                   (in_row * in_width) + (in_col);
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    in_backprop[in_backprop_offset] = sum;
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                           const DepthwiseArgs& args,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* out_backprop,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           const T* filter, T* in_backprop,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                           TensorFormat data_format) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  switch (data_format) {
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NHWC:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropInputGPUKernelNHWC<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropInputGPUKernelNCHW<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW_VECT_C:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      return;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  const int num_in_backprop =
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      args.batch * args.in_rows * args.in_cols * args.in_depth;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  CudaLaunchConfig config =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      GetCudaLaunchConfig(num_in_backprop, device, kernel, 0, 0);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      args, out_backprop, filter, in_backprop, num_in_backprop);
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                           const DepthwiseArgs& args,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                           const T* out_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                           const T* filter, T* in_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                           TensorFormat data_format) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  if (args.depth_multiplier == 1) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    if (CanLaunchDepthwiseConv2dGPUSmall(args)) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_BACKWARD, kKnownFilterWidth,
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower                                    kKnownFilterHeight>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          device, args, out_backprop, filter, in_backprop, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                          kKnownFilterHeight, 1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, filter, in_backprop, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                          kKnownFilterHeight, -1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, filter, in_backprop, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower}
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()(
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    const T* filter, T* in_backprop, TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  if (args.filter_rows == 3 && args.filter_cols == 3) {
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, filter, in_backprop, data_format);
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  } else {
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, filter, in_backprop, data_format);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  auto stream = ctx->op_device_context()->stream();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  OP_REQUIRES(ctx, stream->ok(),
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower              errors::Internal("Launch of gpu kernel for "
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                               "DepthwiseConv2dBackpropInp"
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                               "utGPULaunch failed"));
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower}
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter.
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2)
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower    DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* out_backprop,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               const T* input,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               T* filter_backprop,
3c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower                                               int num_out_backprop) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = args.in_cols;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  const int depth_multiplier =
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int stride = args.stride;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_height = args.out_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int out_width = args.out_cols;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  const int out_depth = args.out_depth;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) {
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the indexes of this thread in the output.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_channel = thread_id % out_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_col = (thread_id / out_depth) % out_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int out_row = (thread_id / out_depth / out_width) % out_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int batch = thread_id / out_depth / out_width / out_height;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    // Compute the input depth and the index of depth multiplier.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_channel = out_channel / depth_multiplier;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int dm = out_channel % depth_multiplier;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // Decide if all input is valid, if yes, we can skip the boundary checks
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    // for each input.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_row_start = out_row * stride - pad_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_col_start = out_col * stride - pad_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_row_end = in_row_start + filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int in_col_end = in_col_start + filter_width;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const int out_backprop_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        out_channel +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        out_depth * (out_col + out_width * (out_row + out_height * batch));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    const T out_bp = ldg(out_backprop + out_backprop_offset);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        in_col_end < in_width) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int filter_row = 0; filter_row < filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++filter_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int in_row = in_row_start + filter_row;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int input_offset_temp = in_width * (in_row + in_height * batch);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int filter_col = 0; filter_col < filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    ++filter_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int in_col = in_col_start + filter_col;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int input_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              in_channel + in_depth * (in_col + input_offset_temp);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          T partial_sum = ldg(input + input_offset) * out_bp;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          T* addr =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              filter_backprop +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              (dm + depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                        (in_channel +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                         in_depth * (filter_col + filter_width * filter_row)));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen          CudaAtomicAdd(addr, partial_sum);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen      }
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen    } else {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int filter_row = 0; filter_row < filter_height;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                  ++filter_row) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int in_row = in_row_start + filter_row;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen        // Avoid repeated computation.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        const int input_offset_temp = in_width * (in_row + in_height * batch);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int filter_col = 0; filter_col < filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                    ++filter_col) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int in_col = in_col_start + filter_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          const int addr_temp = filter_width * filter_row;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          if (in_row >= 0 && in_row < in_height && in_col >= 0 &&
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower              in_col < in_width) {
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            const int input_offset =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                in_channel + in_depth * (in_col + input_offset_temp);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T partial_sum = ldg(input + input_offset) * out_bp;
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            T* addr =
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen                filter_backprop +
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                (dm + depth_multiplier *
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower                          (in_channel + in_depth * (filter_col + addr_temp)));
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // Potentially many threads can add to the same address so we have
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // to use atomic add here.
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // TODO(jmchen): If atomic add turns out to be slow, we can:
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen            // 1. allocate multiple buffers for the gradients (one for each
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // example in a batch, for example). This can reduce the
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // contention on the destination; 2. Have each thread compute one
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // gradient for an element in the filters. This should work well
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            // when the input depth is big and filter size is not too small.
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan            CudaAtomicAdd(addr, partial_sum);
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan          }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan        }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan      }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan    }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan  }
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan}
ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// Device function to compute sub-warp sum reduction for a power-of-two group of
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// neighboring threads.
0f65c8f572201f8838189f3e3c3e455759112c14A. Unique TensorFlowertemplate <int kWidth, typename T>
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen__device__ __forceinline__ T WarpSumReduce(T val) {
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  // support only power-of-two widths.
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  assert(__popc(kWidth) == 1);
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  int sub_warp = cub::LaneId() / kWidth;
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  int zeros = sub_warp * kWidth;
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  unsigned mask = ((1UL << kWidth) - 1) << zeros;
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  for (int delta = kWidth / 2; delta > 0; delta /= 2) {
abdc62aee1eeba32be56d761a2f9988306356084A. Unique TensorFlower    val += CudaShuffleXorSync(mask, val, delta);
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  }
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen  return val;
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen}
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// NHWC format, tailored for small images up to 32x32. Stride and depth
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input tensor are loaded into shared memory before performing the
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// convolution. Per iteration and filter element, each thread first performs
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// a partial convolution for two elements, one each in the lower and upper half
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// of a tile. The intermediate result of all pixels of a warp are then
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// accumulated and written to shared memory. Finally, the values in shared
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// up in global memory using atomics.
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// Requirements: threads per block must be multiple of 32 and <= launch_bounds,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth.
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          int kBlockDepth, int kAccumPixels>
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const DepthwiseArgs args, const T* output, const T* input, T* filter) {
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower  assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Holds block plus halo and filter data for blockDim.x depths.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  T* const shared_data = reinterpret_cast<T*>(shared_memory);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int num_batches = args.batch;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_height = args.in_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_width = blockDim.y;  // slower (see b/62280718): args.in_cols;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int in_depth = args.in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_height =
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_width =
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_height = args.pad_rows;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_width = args.pad_cols;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  assert(blockDim.x == kBlockDepth);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  assert(blockDim.y == args.in_cols);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int block_height = blockDim.z;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // These values are the same for all threads and could
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // be precomputed on the CPU.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int block_size = block_height * in_width * kBlockDepth;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  assert((block_size & 31) == 0);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_row_size = in_width * in_depth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_size = in_height * in_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_increment = (in_width - 1) * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int filter_pixels = filter_height * filter_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_width = in_width + filter_width - 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_height = 2 * block_height + filter_height - 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_row_size = tile_width * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_size = tile_height * tile_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_offset = block_height * tile_row_size;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int pad_offset = pad_height * tile_width + pad_width;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int in_blocks = batch_blocks * num_batches;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tensor_offset = block_height * in_row_size;
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower  // The accumulator has a fixed number of pixels that can be reduced by one
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int accum_increment = kAccumPixels * kBlockDepth;
5cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower  const int accum_size = filter_pixels * accum_increment;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_depth = threadIdx.x;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_col = threadIdx.y;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int thread_row = threadIdx.z;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in block.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int thread_pix = thread_row * in_width + thread_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int thread_idx = thread_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Initialize tile, in particular the padding and accumulator.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int i = thread_idx; i < tile_size + accum_size; i += block_size) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    shared_data[i] = T(0);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  __syncthreads();
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in tensors.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tensor_idx = thread_pix * in_depth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  // Position in (padded) shared memory.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int data_pix = thread_row * tile_width + thread_col;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int data_idx = data_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // Position in shared memory, offset by pad_height / pad_width.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int tile_pix = data_pix + pad_offset;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_idx = tile_pix * kBlockDepth + thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // Position in accumulator (kBlockDepth per warp, depth major).
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int accum_pix = thread_pix / (32 / kBlockDepth);
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower  const int accum_idx = thread_depth * kAccumPixels + accum_pix;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int max_channel = in_depth - thread_depth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  const int accum_offset = tile_size + accum_idx;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const bool skip_second = block_height + thread_row >= in_height;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int batch = b / batch_blocks;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int block = b - batch * batch_blocks;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int start_channel = block * kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const int filter_offset = tensor_idx + start_channel;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const int inout_offset = batch * in_size + filter_offset;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const bool channel_in_range = start_channel < max_channel;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    if (channel_in_range) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const in_ptr = inout_offset + input;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* const tile_ptr = tile_idx + shared_data;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      tile_ptr[0] = ldg(in_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      if (!skip_second) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    if (channel_in_range) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T* const out_ptr = inout_offset + output;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T out1 = ldg(out_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      const T out2 = skip_second ? T(0) : ldg(tensor_offset + out_ptr);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      int shared_offset = data_idx;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      T* accum_ptr = accum_offset + shared_data;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      UNROLL for (int r = 0; r < filter_height; ++r) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        UNROLL for (int c = 0; c < filter_width; ++c) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          const T* const tile_ptr = shared_offset + shared_data;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          // Warp-accumulate pixels of the same depth and write to accumulator.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          for (int delta = 16; delta >= kBlockDepth; delta /= 2) {
abdc62aee1eeba32be56d761a2f9988306356084A. Unique TensorFlower            val += CudaShuffleXorSync(active_threads, val, delta);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          }
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          if (!(thread_idx & 32 - kBlockDepth) /* lane_idx < kBlockDepth */) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower            *accum_ptr = val;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          }
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          shared_offset += kBlockDepth;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          accum_ptr += accum_increment;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        shared_offset += in_increment;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    // Note: the condition to reach this is uniform across the entire block.
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    __syncthreads();
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const T* const accum_data = tile_size + shared_data;
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    for (int i = thread_idx; i < accum_size; i += block_size) {
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower      const int filter_idx = i / kAccumPixels;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      const int filter_pix = filter_idx / kBlockDepth;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      const int filter_channel = filter_idx % kBlockDepth + start_channel;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      const int filter_offset = filter_pix * in_depth + filter_channel;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      if (filter_channel < in_depth) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        T val = accum_data[i];
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower        // Warp-accumulate the pixels of the same depth from the accumulator.
a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen        val = WarpSumReduce<kAccumPixels>(val);
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower        if (!(thread_idx & kAccumPixels - 1)) {
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower          CudaAtomicAdd(filter_offset + filter, val);
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower        }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower      }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
// CUDA kernel that accumulates the depthwise convolution gradient with respect
// to the filter, addressing out_backprop and input in NCHW order.
//
// For every flat index handed out by CUDA_1D_KERNEL_LOOP (bounded by
// num_out_backprop), one out_backprop element is read, multiplied with each
// input element covered by the filter window, and the product is atomically
// added to the corresponding filter_backprop entry. Input positions that fall
// outside the image contribute nothing.
//
// Filter width/height and the depth multiplier are taken from the template
// arguments when those are non-negative, and from args otherwise.
// The kernel only ever adds into filter_backprop; presumably the caller clears
// that buffer before the launch (not visible here).
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kKnownDepthMultiplier>
__global__ void __launch_bounds__(640, 2)
    DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args,
                                               const T* out_backprop,
                                               const T* input,
                                               T* filter_backprop,
                                               int num_out_backprop) {
  // Input geometry.
  const int src_h = args.in_rows;
  const int src_w = args.in_cols;
  const int src_d = args.in_depth;
  // Filter geometry: compile-time value if known, run-time value otherwise.
  const int k_h = kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
  const int k_w = kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
  const int mult =
      kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier;
  const int step = args.stride;
  const int pad_top = args.pad_rows;
  const int pad_left = args.pad_cols;
  // Output geometry.
  const int dst_h = args.out_rows;
  const int dst_w = args.out_cols;
  const int dst_d = args.out_depth;

  CUDA_1D_KERNEL_LOOP(flat_idx, num_out_backprop) {
    // Split the flat index into (image, channel, row, col); col varies fastest.
    const int dst_x = flat_idx % dst_w;
    const int dst_y = (flat_idx / dst_w) % dst_h;
    const int dst_c = (flat_idx / dst_w / dst_h) % dst_d;
    const int image = flat_idx / dst_d / dst_w / dst_h;

    // Input channel feeding this output channel, and the multiplier slot.
    const int src_c = dst_c / mult;
    const int mult_idx = dst_c % mult;

    // Top-left corner and (exclusive) end of the window in input coordinates.
    const int y_begin = dst_y * step - pad_top;
    const int x_begin = dst_x * step - pad_left;
    const int y_end = y_begin + k_h;
    const int x_end = x_begin + k_w;

    const int grad_offset =
        ((image * dst_d + dst_c) * dst_h + dst_y) * dst_w + dst_x;
    const T grad = ldg(out_backprop + grad_offset);

    // Index of row 0 of the (image, src_c) input plane, counted in rows.
    const int plane_row0 = (image * src_d + src_c) * src_h;

    // When the whole window lies inside the image the per-element bounds
    // checks can be dropped.
    const bool window_inside =
        y_begin >= 0 && x_begin >= 0 && y_end < src_h && x_end < src_w;

    if (window_inside) {
      UNROLL for (int ky = 0; ky < k_h; ++ky) {
        const int row_offset = (plane_row0 + y_begin + ky) * src_w;
        UNROLL for (int kx = 0; kx < k_w; ++kx) {
          const int src_offset = row_offset + x_begin + kx;
          const int filter_offset =
              mult_idx + mult * (src_c + src_d * (kx + k_w * ky));
          const T contribution = ldg(input + src_offset) * grad;
          CudaAtomicAdd(filter_backprop + filter_offset, contribution);
        }
      }
    } else {
      UNROLL for (int ky = 0; ky < k_h; ++ky) {
        const int src_y = y_begin + ky;
        // Rows outside the image contribute nothing.
        if (src_y < 0 || src_y >= src_h) continue;
        const int row_offset = (plane_row0 + src_y) * src_w;
        UNROLL for (int kx = 0; kx < k_w; ++kx) {
          const int src_x = x_begin + kx;
          // Columns outside the image contribute nothing.
          if (src_x < 0 || src_x >= src_w) continue;
          const int filter_offset =
              mult_idx + mult * (src_c + src_d * (kx + k_w * ky));
          const T contribution = ldg(input + row_offset + src_x) * grad;
          // Many threads may target the same filter element, hence the
          // atomic add.
          // TODO(jmchen): If atomic add turns out to be slow, we can:
          // 1. allocate multiple buffers for the gradients (one for each
          // example in a batch, for example). This can reduce the
          // contention on the destination; 2. Have each thread compute one
          // gradient for an element in the filters. This should work well
          // when the input depth is big and filter size is not too small.
          CudaAtomicAdd(filter_backprop + filter_offset, contribution);
        }
      }
    }
  }
}
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in
// NCHW format, tailored for small images up to 32x32. Stride and depth
// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if
// CanLaunchDepthwiseConv2dGPUSmall(args) returns true.
// Tiles of the input tensor are loaded into shared memory before performing the
// convolution. Per iteration and filter element, each thread first performs
// a partial convolution for two elements, one each in the lower and upper half
// of a tile. The intermediate result of all pixels of a warp are then
// accumulated and written to shared memory. Finally, the values in shared
// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed
// up in global memory using atomics.
//
// Launch layout: blockDim = (args.in_cols, block_height, kBlockDepth); blocks
// stride over groups of kBlockDepth (batch, channel) slices via blockIdx.x /
// gridDim.x. Dynamic shared memory must hold at least
// kBlockDepth * (tile_pixels + filter_pixels * kAccumPixels) elements of T.
// Requirements: threads per block must be multiple of 32 and <= launch_bounds,
// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth.
// The kernel only adds into `filter`; presumably the caller clears it first.
template <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
          int kBlockDepth, int kAccumPixels>
__global__
__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
    const DepthwiseArgs args, const T* output, const T* input, T* filter) {
  assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
  // Shared memory layout: [0, tile_size) holds the input tile (block plus
  // halo) for kBlockDepth slices, [tile_size, tile_size + accum_size) holds
  // the per-warp partial sums (the accumulator).
  extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[];
  T* const shared_data = reinterpret_cast<T*>(shared_memory);

  const int num_batches = args.batch;
  const int in_height = args.in_rows;
  const int in_width = blockDim.x;  // slower (see b/62280718): args.in_cols;
  const int in_depth = args.in_depth;
  const int filter_height =
      kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight;
  const int filter_width =
      kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth;
  const int pad_height = args.pad_rows;
  const int pad_width = args.pad_cols;

  assert(blockDim.x == args.in_cols);
  assert(blockDim.z == kBlockDepth);
  const int block_height = blockDim.y;

  // These values are the same for all threads and could
  // be precomputed on the CPU.
  const int block_pixels = in_width * block_height;
  const int block_size = block_pixels * kBlockDepth;
  assert((block_size & 31) == 0);
  const int in_pixels = in_width * in_height;
  // Added after each filter row to move shared_offset to the next tile row
  // (the inner loop has already advanced it by filter_width).
  const int in_increment = in_width - 1;
  const int filter_pixels = filter_height * filter_width;
  const int tile_width = in_width + filter_width - 1;
  // Each thread handles two pixels (rows r and r + block_height), hence 2x.
  const int tile_height = 2 * block_height + filter_height - 1;
  const int tile_pixels = tile_width * tile_height;
  const int tile_size = tile_pixels * kBlockDepth;
  // Distance in the tile between a thread's first and second pixel.
  const int tile_offset = block_height * tile_width;
  const int pad_offset = pad_height * tile_width + pad_width;
  // Number of (batch, channel) slices; consecutive slices are in_pixels apart.
  const int in_total_depth = in_depth * num_batches;
  const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth;
  // The accumulator has a fixed number of pixels that can be reduced by one
  // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written.
  assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth);
  const int accum_increment = kAccumPixels * kBlockDepth;
  const int accum_size = filter_pixels * accum_increment;

  const int thread_col = threadIdx.x;
  const int thread_row = threadIdx.y;
  const int thread_depth = threadIdx.z;

  // Position in block.
  const int thread_pix = thread_row * in_width + thread_col;
  const int thread_idx = thread_depth * block_pixels + thread_pix;

  // Initialize tile, in particular the padding and accumulator. Only the
  // non-padding tile positions and the accumulator are rewritten later, so
  // the padding stays zero for the lifetime of the block.
  for (int i = thread_idx; i < tile_size + accum_size; i += block_size) {
    shared_data[i] = T(0);
  }
  __syncthreads();

  // Position in tensors.
  const int tensor_idx = thread_depth * in_pixels + thread_pix;

  // Position in (padded) shared memory.
  const int data_pix = thread_row * tile_width + thread_col;
  const int data_idx = thread_depth * tile_pixels + data_pix;

  // Position in shared memory, offset by pad_height / pad_width.
  const int tile_idx = data_idx + pad_offset;

  // Position in accumulator (kBlockDepth per warp, depth major).
  const int accum_pix = thread_pix / (32 / kBlockDepth);
  const int accum_idx = thread_depth * kAccumPixels + accum_pix;

  // A thread's slice (channel + thread_depth) is valid iff
  // channel < max_channel.
  const int max_channel = in_total_depth - thread_depth;
  const int accum_offset = tile_size + accum_idx;
  // The second pixel of this thread lies below the image; skip it.
  const bool skip_second = block_height + thread_row >= in_height;

  for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) {
    // First slice handled by this block in this iteration.
    const int channel = b * kBlockDepth;

    const int inout_offset = channel * in_pixels + tensor_idx;
    const bool channel_in_range = channel < max_channel;

    // Stage the two input pixels of this thread in the shared tile.
    if (channel_in_range) {
      const T* const in_ptr = inout_offset + input;
      T* const tile_ptr = tile_idx + shared_data;
      tile_ptr[0] = ldg(in_ptr);
      if (!skip_second) {
        tile_ptr[tile_offset] = ldg(block_pixels + in_ptr);
      }
    }

    // Note: the condition to reach this is uniform across the entire block.
    __syncthreads();
    // Lanes of this warp that take part in the shuffles below.
    unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range);

    if (channel_in_range) {
      const T* const out_ptr = inout_offset + output;
      const T out1 = ldg(out_ptr);
      const T out2 = skip_second ? T(0) : ldg(block_pixels + out_ptr);
      int shared_offset = data_idx;
      T* accum_ptr = accum_offset + shared_data;
      UNROLL for (int r = 0; r < filter_height; ++r) {
        UNROLL for (int c = 0; c < filter_width; ++c) {
          const T* const tile_ptr = shared_offset + shared_data;
          T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset];
          // Warp-accumulate pixels of the same depth and write to accumulator.
          for (int delta = 16 / kBlockDepth; delta > 0; delta /= 2) {
            val += CudaShuffleXorSync(active_threads, val, delta);
          }
          if (!(thread_idx & (32 / kBlockDepth - 1))) {
            *accum_ptr = val;  // kBlockDepth threads per warp.
          }
          ++shared_offset;
          accum_ptr += accum_increment;
        }
        shared_offset += in_increment;
      }
    }

    // Note: the condition to reach this is uniform across the entire block.
    __syncthreads();

    const T* const accum_data = tile_size + shared_data;
    for (int i = thread_idx; i < accum_size; i += block_size) {
      // Accumulator index i decomposes as
      //   (filter_pix * kBlockDepth + depth) * kAccumPixels + pixel.
      const int filter_idx = i / kAccumPixels;
      const int filter_pix = filter_idx / kBlockDepth;
      // Global slice (batch * in_depth + channel) this entry belongs to.
      const int slice = channel + filter_idx % kBlockDepth;
      const int filter_channel = slice % in_depth;
      const int filter_offset = filter_pix * in_depth + filter_channel;
      T val = accum_data[i];
      // Warp-accumulate pixels of the same depth from the accumulator.
      val = WarpSumReduce<kAccumPixels>(val);
      // Entries of slices past the end of the tensor were not written in this
      // iteration and may still hold sums from a previous iteration of the b
      // loop. Because filter_channel wraps around via % in_depth, they must be
      // excluded explicitly or they would be added to an unrelated channel.
      if (slice < in_total_depth && !(thread_idx & (kAccumPixels - 1))) {
        CudaAtomicAdd(filter_offset + filter, val);
      }
    }
  }
}
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          int kBlockDepth, int kAccumPixels>
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const GpuDevice& device, const DepthwiseArgs& args, const int block_height,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    const T* out_backprop, const T* input, T* filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_width = args.in_cols + args.filter_cols - 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_height = block_height * 2 + args.filter_rows - 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int tile_pixels = tile_height * tile_width;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  const int filter_pixels = args.filter_rows * args.filter_cols;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  const int shared_memory_size =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      kBlockDepth * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(T);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  if (shared_memory_size > device.sharedMemPerBlock()) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    return false;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  dim3 block_dim;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  int block_count;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  switch (data_format) {
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NHWC:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      block_dim = dim3(kBlockDepth, args.in_cols, block_height);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      block_count =
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      block_dim = dim3(args.in_cols, block_height, kBlockDepth);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      block_count =
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW_VECT_C:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      return false;
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower  }
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  const int num_out_backprop = args.out_rows * args.out_cols * block_count;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize(
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      num_out_backprop, device, kernel, shared_memory_size,
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      block_dim.x * block_dim.y * block_dim.z);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  kernel<<<config.block_count, block_dim, shared_memory_size,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower           device.stream()>>>(args, out_backprop, input, filter_backprop);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  return true;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower}
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          int kBlockDepth>
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const GpuDevice& device, const DepthwiseArgs& args, const int block_height,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    const T* out_backprop, const T* input, T* filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    TensorFormat data_format) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  // Minimize (power of two) kAccumPixels, while satisfying
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // kAccumPixels * 32 >= block_height * in_width * kBlockDepth.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const int block_pixels = block_height * args.in_cols * kBlockDepth;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  if (block_pixels > 512) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 32>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, block_height, out_backprop, input, filter_backprop,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else if (block_pixels > 256) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 16>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, block_height, out_backprop, input, filter_backprop,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 8>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, block_height, out_backprop, input, filter_backprop,
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower}
f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevantemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    const GpuDevice& device, const DepthwiseArgs& args, const T* out_backprop,
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower    const T* input, T* filter_backprop, TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  // Maximize (power of two) kBlockDepth while keeping a block within 1024
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  // threads (2 pixels per thread).
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  int block_depth = 8;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  int block_height = (args.in_rows + 1) / 2;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  int round_mask = 1;
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  for (; block_depth > 1; block_depth /= 2) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    // args.in_cols * block_height * kBlockDepth must be multiple of 32.
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    for (; block_height * args.in_cols * block_depth & 31;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower         round_mask = round_mask * 2 + 1) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      block_height = block_height + round_mask & ~round_mask;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    }
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower    int block_size = block_height * args.in_cols * block_depth;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    if (block_size <= 1024) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      break;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    }
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan  }
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_height)) {
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan    return false;
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan  }
8ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  switch (block_depth) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    case 8:
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, 8>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          device, args, block_height, out_backprop, input, filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    case 4:
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, 4>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          device, args, block_height, out_backprop, input, filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    case 2:
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, 2>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower          device, args, block_height, out_backprop, input, filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower          data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    default:
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return false;
f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower  }
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower}
3f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower          int kKnownDepthMultiplier>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                            const DepthwiseArgs& args,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* out_backprop,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            const T* input, T* filter_backprop,
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower                                            TensorFormat data_format) {
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int);
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower  switch (data_format) {
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NHWC:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWC<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHW<
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower          T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      break;
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower    case FORMAT_NCHW_VECT_C:
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported";
428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower      return;
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower  }
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  const int num_out_backprop =
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      args.batch * args.out_rows * args.out_cols * args.out_depth;
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower  CudaLaunchConfig config =
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower      GetCudaLaunchConfig(num_out_backprop, device, kernel, 0, 0);
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>(
824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower      args, out_backprop, input, filter_backprop, num_out_backprop);
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower}
7828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight>
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device,
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower                                            const DepthwiseArgs& args,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                            const T* out_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                            const T* input, T* filter_backprop,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                            TensorFormat data_format) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  if (args.depth_multiplier == 1) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    if (TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<T, kKnownFilterWidth,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                                       kKnownFilterHeight>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower            device, args, out_backprop, input, filter_backprop, data_format)) {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower      return;
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                           kKnownFilterHeight, 1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, input, filter_backprop, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  } else {
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower    LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth,
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower                                           kKnownFilterHeight, -1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, input, filter_backprop, data_format);
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower  }
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower}
7fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution.
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T>
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()(
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    const T* input, T* filter_backprop, TensorFormat data_format) {
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower  const GpuDevice& device = ctx->eigen_device<GpuDevice>();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  auto stream = ctx->op_device_context()->stream();
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  // Initialize the results to 0.
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  int num_filter_backprop =
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower      args.filter_rows * args.filter_cols * args.out_depth;
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop,
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                                                      num_filter_backprop);
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T));
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  if (args.filter_rows == 3 && args.filter_cols == 3) {
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, input, filter_backprop, data_format);
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  } else {
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower    LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1>(
a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower        device, args, out_backprop, input, filter_backprop, data_format);
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen  }
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower  OP_REQUIRES(ctx, stream->ok(),
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower              errors::Internal("Launch of gpu kernel for "
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                               "DepthwiseConv2dBackpropFil"
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower                               "terGPULaunch failed"));
07356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower}
5f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>;
45fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>;
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}  // namespace tensorflow
b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#endif  // GOOGLE_CUDA