1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 3b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenLicensed under the Apache License, Version 2.0 (the "License"); 4b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenyou may not use this file except in compliance with the License. 5b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenYou may obtain a copy of the License at 6b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 7b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen http://www.apache.org/licenses/LICENSE-2.0 8b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 9b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenUnless required by applicable law or agreed to in writing, software 10b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chendistributed under the License is distributed on an "AS IS" BASIS, 11b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenSee the License for the specific language governing permissions and 13b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenlimitations under the License. 14b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen==============================================================================*/ 15b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 16b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#if GOOGLE_CUDA 17b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define EIGEN_USE_GPU 18b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 19b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 200f65c8f572201f8838189f3e3c3e455759112c14A. Unique TensorFlower#include "external/cub_archive/cub/util_ptx.cuh" 2107356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower#include "tensorflow/core/framework/op_kernel.h" 22ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/kernels/depthwise_conv_op.h" 23b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/platform/types.h" 24b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/util/cuda_kernel_helper.h" 25ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/util/tensor_format.h" 26b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 27b1f63441d223861f3f8aac17f85989604538dec9Loo Rong Jie#if defined(_MSC_VER) && !defined(__clang__) 28ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#define UNROLL 297828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL 30b1f63441d223861f3f8aac17f85989604538dec9Loo Rong Jie#else 31b1f63441d223861f3f8aac17f85989604538dec9Loo Rong Jie#define UNROLL _Pragma("unroll") 32b1f63441d223861f3f8aac17f85989604538dec9Loo Rong Jie#define NOUNROLL _Pragma("nounroll") 33e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#endif 34b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 35b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chennamespace tensorflow { 36b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 377828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowerusing Eigen::GpuDevice; 38b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 39f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// Returns whether depthwise convolution forward or backward input pass can be 40f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// performed using the faster ('Small') variant of the kernel. 41aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlowerEIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dGPUSmall( 4245fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args) { 437fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 && 447fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower args.in_cols <= 32 && args.in_rows == args.out_rows && 45aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower args.in_cols == args.out_cols && args.pad_rows >= 0 && 46aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower args.pad_rows < args.filter_rows && args.pad_cols >= 0 && 47aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower args.pad_cols < args.filter_cols && 48aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower args.filter_rows * args.filter_cols <= 49aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower (args.in_rows + 1) / 2 * args.in_cols; 50aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower} 51aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 52f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// Returns whether depthwise convolution backward filter pass can be performed 53f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// using the faster ('Small') variant of the kernel. 54f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlowerEIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall( 55a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const DepthwiseArgs& args, const int block_height) { 567fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return args.depth_multiplier == 1 && args.stride == 1 && args.in_rows <= 32 && 577fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower args.in_cols <= 32 && args.in_rows == args.out_rows && 58f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower args.in_cols == args.out_cols && args.pad_rows >= 0 && 59f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower args.pad_rows < args.filter_rows && args.pad_cols >= 0 && 60a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower args.pad_cols < args.filter_cols && block_height <= args.in_rows && 61a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower args.filter_rows * args.filter_cols <= args.in_cols * block_height; 62f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower} 63f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 64824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// The DepthwiseConv2dGPUKernels perform either forward or backprop input 65824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// convolution depending on a template argument of this enum. 66824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowerenum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; 67824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower 68ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass 69ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NHWC format. 707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2) 733c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input, 743c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* output, int num_outputs) { 75a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 76a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 77b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int in_depth = args.in_depth; 78a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 797828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 80a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 84b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int stride = args.stride; 85a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 86a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 87a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 88a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 89b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int out_depth = args.out_depth; 90b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 91b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { 92b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen // Compute the indexes of this thread in the output. 93a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel = thread_id % out_depth; 94a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col = (thread_id / out_depth) % out_width; 95a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row = (thread_id / out_depth / out_width) % out_height; 96a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / out_depth / out_width / out_height; 97b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen // Compute the input depth and the index of depth multiplier. 98a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = out_channel / depth_multiplier; 99a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int multiplier = out_channel % depth_multiplier; 100b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 101ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 102ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 103a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_row_start = out_row * stride - pad_height; 104a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_col_start = out_col * stride - pad_width; 105a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_row_end = input_row_start + filter_height; 106a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_col_end = input_col_start + filter_width; 107b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 108b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum = static_cast<T>(0); 1095f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 110a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = in_height * batch; 111b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen if (input_row_start >= 0 && input_col_start >= 0 && 112a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower input_row_end < in_height && input_col_end < in_width) { 113a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 114a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 115a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = input_row_start + filter_row; 116a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset_temp = filter_width * filter_row; 117a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 118a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 119a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 1205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 122a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel + 123a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_depth * (in_col + in_width * (in_row + input_offset_temp)); 1245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset = 1255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen multiplier + 126a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower depth_multiplier * 127a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + filter_offset_temp)); 1285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen sum += ldg(input + input_offset) * ldg(filter + filter_offset); 129b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 130b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 131b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } else { 132a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 133a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 134a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = input_row_start + filter_row; 135a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset_temp = filter_width * filter_row; 136a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 137a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 138a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 139a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row >= 0 && in_row < in_height && in_col >= 0 && 140a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col < in_width) { 141a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 1425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 144a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel + 145a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_depth * (in_col + in_width * (in_row + input_offset_temp)); 1465f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset = 147a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower multiplier + 148a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower depth_multiplier * 149a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + filter_offset_temp)); 1505f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen sum += ldg(input + input_offset) * ldg(filter + filter_offset); 151b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 152b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 153b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 154b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 1555f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen output[thread_id] = sum; 156b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 157b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen} 158ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 159827874c3071b36960f5ad614edcfcdd193692718A. Unique TensorFlower// CUDA kernel to compute the depthwise convolution forward pass in NHWC format, 1607fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// tailored for small images up to 32x32. Stride and depth multiplier must be 1. 161aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Padding must be 'SAME', which allows to reuse the index computation. Only 162aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. 1633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before 1643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration, 1653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// one each in the lower and upper half of a tile. 166824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// Backprop input direction is the same as forward direction with the filter 167824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// rotated by 180°. 168824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection, 169a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth, 170a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower bool kKnownEvenHeight> 1713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( 1723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const DepthwiseArgs args, const T* input, const T* filter, T* output) { 173aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower assert(CanLaunchDepthwiseConv2dGPUSmall(args)); 1743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Holds block plus halo and filter data for blockDim.x depths. 1753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; 1763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* const shared_data = reinterpret_cast<T*>(shared_memory); 1773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 178a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int num_batches = args.batch; 179a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 180a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 1813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int in_depth = args.in_depth; 182a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 1833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 184a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 1853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 186a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 187a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 1883f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 189428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.x == kBlockDepth); 190428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.y == args.in_cols); 191a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_height = blockDim.z; 1923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // These values are the same for all threads and could 1943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // be precomputed on the CPU. 195a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_size = block_height * in_width * kBlockDepth; 196a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_size = in_width * in_depth; 197a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_size = in_height * in_row_size; 198a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_increment = (in_width - 1) * kBlockDepth; 199a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pixels = filter_height * filter_width; 200a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = in_width + filter_width - 1; 201a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int even_height = kKnownEvenHeight || (1 & ~in_height); 202a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = in_height + filter_height - even_height; 203a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_row_size = tile_width * kBlockDepth; 204a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_size = tile_height * tile_row_size; 205a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_offset = block_height * tile_row_size; 206a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_offset = pad_height * tile_width + pad_width; 207a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth; 208a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_blocks = batch_blocks * num_batches; 2093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int tensor_offset = 210a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownEvenHeight ? in_size / 2 : block_height * in_row_size; 2113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_depth = threadIdx.x; 2133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_col = threadIdx.y; 2143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_row = threadIdx.z; 2153f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in block. 217a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_pix = thread_row * in_width + thread_col; 218a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_idx = thread_pix * kBlockDepth + thread_depth; 2193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Initialize tile, in particular the padding. 2213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower for (int i = thread_idx; i < tile_size; i += block_size) { 2223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower shared_data[i] = T(0); 2233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2243f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 2253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in tensors. 2273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int tensor_idx = thread_pix * in_depth + thread_depth; 2283f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in (padded) shared memory. 230a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_pix = thread_row * tile_width + thread_col; 231a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_idx = data_pix * kBlockDepth + thread_depth; 2323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 233a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in shared memory, offset by pad_height / pad_width. 2343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int tile_pix = data_pix + pad_offset; 235a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_idx = tile_pix * kBlockDepth + thread_depth; 2363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 237a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int max_channel = in_depth - thread_depth; 2383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int filter_write_offset = 2395cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower thread_pix < filter_pixels ? tile_size + thread_idx : 0; 240824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower const int filter_read_offset = 241824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower tile_size + thread_depth + 242a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth); 2433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const bool skip_second = 244a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; 2453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { 2473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int batch = b / batch_blocks; 248a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block = b - batch * batch_blocks; 2493f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 250a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int start_channel = block * kBlockDepth; 251a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset = tensor_idx + start_channel; 2523f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int inout_offset = batch * in_size + filter_offset; 253a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool channel_in_range = start_channel < max_channel; 2543f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 255a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 2563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const in_ptr = inout_offset + input; 2573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* const tile_ptr = tile_idx + shared_data; 2583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower tile_ptr[0] = ldg(in_ptr); 2593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower if (!skip_second) { 2603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr); 2613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower if (filter_write_offset != 0) { 2643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower shared_data[filter_write_offset] = ldg(filter_offset + filter); 2653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2673f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2683f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 2693f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 2703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 271a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 272b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum1 = static_cast<T>(0); 273b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum2 = static_cast<T>(0); 2743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower int shared_offset = data_idx; 2753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* filter_ptr = filter_read_offset + shared_data; 276a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int r = 0; r < filter_height; ++r) { 277a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int c = 0; c < filter_width; ++c) { 278824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower if (kDirection == DIRECTION_BACKWARD) { 279a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_ptr -= kBlockDepth; 280824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 2813f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T filter_value = *filter_ptr; 2823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const tile_ptr = shared_offset + shared_data; 2833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower sum1 += filter_value * tile_ptr[0]; 2843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower sum2 += filter_value * tile_ptr[tile_offset]; 285a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower shared_offset += kBlockDepth; 286824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower if (kDirection == DIRECTION_FORWARD) { 287a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_ptr += kBlockDepth; 288824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 2893f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower shared_offset += in_increment; 2913f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* const out_ptr = inout_offset + output; 2933f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower out_ptr[0] = sum1; 2943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower if (!skip_second) { 2953f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower out_ptr[tensor_offset] = sum2; 2963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2973f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 2983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 2993f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 3003f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 3013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 3023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower} 3033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 304ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass 305ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NCHW format. 3067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 3077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 3083c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2) 3093c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input, 3103c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* output, int num_outputs) { 311a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 312a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 313ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 314a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 3157828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 316a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 3177828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 3187828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 3197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 320ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 321a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 322a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 323a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 324a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 325ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 326ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 327ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { 328ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the output. 329ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 330ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We want coalesced reads so we make sure that each warp reads 331ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // a contiguous chunk of memory. 332ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 333ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // THIS IS PROBABLY WRONG, we are not doing coalesced reads 334ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // into the input, because of the depth multiplier division... 335a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col = thread_id % out_width; 336a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row = (thread_id / out_width) % out_height; 337a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel = (thread_id / out_width / out_height) % out_depth; 338a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / out_width / out_height / out_depth; 339ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 340ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the input depth and the index of depth multiplier 341ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // based off the output depth index that this thread is 342ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // computing n. 343a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = out_channel / depth_multiplier; 344a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int multiplier = out_channel % depth_multiplier; 345ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 346ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Data is stored in the following format (let's assume we 347ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // flatten the height and width into one contiguous dimension 348ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // called "P". 349ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 350ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 .... 351ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 .... 352ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 353a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Each row contains in_depth * in_height * in_width values 354ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each sample in the batch. 355ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 356ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We can further flatten it into: 357ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 358ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C1P1 B1C1P2 ..... 359ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C2P1 B1C2P2 .... 360ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C1P1 B2C1P2 ..... 361ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C2P1 B2C2P2 .... 362ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 363ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // where each row is a contiguous array of all of the spatial 364ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // pixels for a given batch and input depth. The following 365ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // loop unrolls across the filter dimensions for a given thread, 366ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // indexing into the filter value and the corresponding input 367ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // patch. 368ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 369ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We can compute the index into the patch once right here. 370a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = 371a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (batch * in_depth + in_channel) * (in_height * in_width); 372ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 373ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Finally, we can iterate over the spatial dimensions and perform the 374ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // convolution, writing into the output at the end. 375ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 376ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We perform an additional optimization, where we can determine 377ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // whether the patch fits within the image indices statically, and 378ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // avoid boundary checking within the loop. 379a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_row_start = out_row * stride - pad_height; 380a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_col_start = out_col * stride - pad_width; 381a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_row_end = input_row_start + filter_height; 382a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_col_end = input_col_start + filter_width; 383ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 384b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum = static_cast<T>(0); 385ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan if (input_row_start >= 0 && input_col_start >= 0 && 386a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower input_row_end < in_height && input_col_end < in_width) { 387ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Loop that doesn't need to check for boundary conditions. 388a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 389a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 390a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = input_row_start + filter_row; 391a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset_temp = filter_width * filter_row; 392a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 393a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 394a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 395ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 396ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = 397a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (input_offset_temp) + (in_row * in_width) + in_col; 398ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 399ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan multiplier + 400a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower depth_multiplier * 401a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + filter_offset_temp)); 402ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(input + input_offset) * ldg(filter + filter_offset); 403ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 404ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 405ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 406ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Loop that needs to check for boundary conditions. 407a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 408a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 409a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = input_row_start + filter_row; 410a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset_temp = filter_width * filter_row; 411a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 412a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 413a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 414a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // TODO(vrv): the in_row check can be done outside of this loop; 415ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // benchmark both methods to determine the better decision. 416a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row >= 0 && in_row < in_height && in_col >= 0 && 417a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col < in_width) { 418a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = input_col_start + filter_col; 419ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 420ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // input_offset_temp indexes into the start of memory 421ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // where the spatial data starts. 422ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = 423a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (input_offset_temp) + (in_row * in_width) + in_col; 424ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 425ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 426a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower multiplier + 427a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower depth_multiplier * 428a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + filter_offset_temp)); 429ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(input + input_offset) * ldg(filter + filter_offset); 430ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 431ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 432ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 433ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 434ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 435ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan output[thread_id] = sum; 436ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 437ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 438ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 439aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution forward pass in NCHW format, 4407fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// tailored for small images up to 32x32. Stride and depth multiplier must be 1. 441aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Padding must be 'SAME', which allows to reuse the index computation. Only 442aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// use this kernel if CanLaunchDepthwiseConv2dGPUSmall(args) returns true. 443aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// Tiles of the input and filter tensors are loaded into shared memory before 444aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// performing the convolution. Each thread handles two elements per iteration, 445aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower// one each in the lower and upper half of a tile. 446824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// Backprop input direction is the same as forward direction with the filter 447824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower// rotated by 180°. 448824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection, 449a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth, 450a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower bool kKnownEvenHeight> 451aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower__global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( 452aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const DepthwiseArgs args, const T* input, const T* filter, T* output) { 453aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower assert(CanLaunchDepthwiseConv2dGPUSmall(args)); 454aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Holds block plus halo and filter data for blockDim.z depths. 455aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; 456aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower T* const shared_data = reinterpret_cast<T*>(shared_memory); 457aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 458a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int num_batches = args.batch; 459a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 460a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 461aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int in_depth = args.in_depth; 462a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 463aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 464a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 465aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 466a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 467a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 468aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 469aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Fixed blockDim.z, tailored for maximum grid size for images of size 16x16. 470428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.x == args.in_cols); 471428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.z == kBlockDepth); 472a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_height = blockDim.y; 473aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 474aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // These values are the same for all threads and could 475aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // be precomputed on the CPU. 476a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_pixels = in_width * block_height; 477a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_size = block_pixels * kBlockDepth; 478a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_pixels = in_width * in_height; 479a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_increment = in_width - 1; 480a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pixels = filter_height * filter_width; 481a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = in_width + filter_width - 1; 482a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int even_height = kKnownEvenHeight || (1 & ~in_height); 483a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = in_height + filter_height - even_height; 484a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_pixels = tile_width * tile_height; 485a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_size = tile_pixels * kBlockDepth; 486a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_offset = block_height * tile_width; 487a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_offset = pad_height * tile_width + pad_width; 488a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_total_depth = in_depth * num_batches; 489a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; 490aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 491aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int thread_col = threadIdx.x; 492aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int thread_row = threadIdx.y; 493aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int thread_depth = threadIdx.z; 494aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 495aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Position in block. 496a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_pix = thread_row * in_width + thread_col; 497aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int thread_idx = thread_depth * block_pixels + thread_pix; 498aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 499aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Initialize tile, in particular the padding. 500aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower for (int i = thread_idx; i < tile_size; i += block_size) { 501aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower shared_data[i] = T(0); 5023f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 503aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower __syncthreads(); 5043f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 505aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Position in tensors. 506aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int tensor_idx = thread_depth * in_pixels + thread_pix; 507aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 508aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Position in (padded) shared memory. 509a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_pix = thread_row * tile_width + thread_col; 510aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int data_idx = thread_depth * tile_pixels + data_pix; 511aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 512a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in shared memory, offset by pad_height / pad_width. 513aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int tile_idx = data_idx + pad_offset; 514aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 515aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Filter is always in HWCK format, irrespective of the input/output format. 516a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pix = thread_idx / kBlockDepth; 517a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_channel = thread_idx % kBlockDepth; 518aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int filter_idx = filter_pix * in_depth; 519aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 520a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int max_channel = in_total_depth - thread_depth; 521aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int filter_write_offset = 522aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower filter_pix < filter_pixels ? tile_size + thread_idx : 0; 523824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower const int filter_read_offset = 524824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower tile_size + thread_depth + 525a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (kDirection == DIRECTION_FORWARD ? 0 : filter_pixels * kBlockDepth); 526aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const bool skip_second = 527a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower !kKnownEvenHeight && thread_row + (in_height & 1) == block_height; 528aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 529aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { 530a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int channel = b * kBlockDepth; 531aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 532a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int inout_offset = channel * in_pixels + tensor_idx; 533a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool channel_in_range = channel < max_channel; 534aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 535a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 536aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const T* const in_ptr = inout_offset + input; 537aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower T* const tile_ptr = tile_idx + shared_data; 538aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower tile_ptr[0] = ldg(in_ptr); 539aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower if (!skip_second) { 540aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower tile_ptr[tile_offset] = ldg(block_pixels + in_ptr); 541aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 542aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 543aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 544aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower if (filter_write_offset != 0) { 545a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset = 546a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_idx + (channel + filter_channel) % in_depth; 547aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower shared_data[filter_write_offset] = ldg(filter_offset + filter); 548aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 549aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 550aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 551aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower __syncthreads(); 552aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 553a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 554b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum1 = static_cast<T>(0); 555b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum2 = static_cast<T>(0); 556aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower int shared_offset = data_idx; 557aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const T* filter_ptr = filter_read_offset + shared_data; 558a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int r = 0; r < filter_height; ++r) { 559a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int c = 0; c < filter_width; ++c) { 560824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower if (kDirection == DIRECTION_BACKWARD) { 561a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_ptr -= kBlockDepth; 562824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 563aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const T filter_value = *filter_ptr; 564aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const T* const tile_ptr = shared_offset + shared_data; 565aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower sum1 += filter_value * tile_ptr[0]; 566aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower sum2 += filter_value * tile_ptr[tile_offset]; 567aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower ++shared_offset; 568824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower if (kDirection == DIRECTION_FORWARD) { 569a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_ptr += kBlockDepth; 570824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 571aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 572aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower shared_offset += in_increment; 573aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 574aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower T* const out_ptr = inout_offset + output; 575aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower out_ptr[0] = sum1; 576aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower if (!skip_second) { 577aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower out_ptr[block_pixels] = sum2; 578aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 579aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower } 580aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 581aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 582aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower __syncthreads(); 5833f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 584aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower} 5853f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 586824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection, 587a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth, 588a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower bool kKnownEvenHeight> 589a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device, 59045fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, const T* input, 59145fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const T* filter, T* output, 592aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower TensorFormat data_format) { 593a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_height = (args.in_rows + 1) / 2; 594824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower dim3 block_dim; 595428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower int block_count; 596824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower void (*kernel)(const DepthwiseArgs, const T*, const T*, T*); 597428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower switch (data_format) { 598428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NHWC: 599428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim = dim3(kBlockDepth, args.in_cols, block_height); 600428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_count = 601428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth; 602428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = 603428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DepthwiseConv2dGPUKernelNHWCSmall<T, kDirection, kKnownFilterWidth, 604428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownFilterHeight, kBlockDepth, 605428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownEvenHeight>; 606428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 607428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW: 608428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim = dim3(args.in_cols, block_height, kBlockDepth); 609428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_count = 610428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth; 611428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = 612428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DepthwiseConv2dGPUKernelNCHWSmall<T, kDirection, kKnownFilterWidth, 613428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownFilterHeight, kBlockDepth, 614428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownEvenHeight>; 615428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 616428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW_VECT_C: 617428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported"; 618428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower return; 619824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 620a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = args.in_cols + args.filter_cols - 1; 621a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = block_height * 2 + args.filter_rows - 1; 622a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_pixels = tile_height * tile_width; 6235cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower const int filter_pixels = args.filter_rows * args.filter_cols; 624aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower const int shared_memory_size = 625a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kBlockDepth * (tile_pixels + filter_pixels) * sizeof(T); 626428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower const int num_outputs = args.out_rows * args.out_cols * block_count; 627428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( 628428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower num_outputs, device, kernel, shared_memory_size, 629428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim.x * block_dim.y * block_dim.z); 630a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kernel<<<config.block_count, block_dim, shared_memory_size, 631a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device.stream()>>>(args, input, filter, output); 632aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower} 633aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower 634824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection, 635a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kKnownFilterWidth, int kKnownFilterHeight, int kBlockDepth> 636a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device, 63745fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, const T* input, 63845fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const T* filter, T* output, 639aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower TensorFormat data_format) { 640aad2e3daff8fcd29ed8e5071d4c37a7f94a0421cA. Unique TensorFlower if (args.in_rows & 1) { 641824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth, 642a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight, kBlockDepth, false>( 643a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 6445d5d9f707f0df1083d87c415f95c22ab3999bfdeA. Unique TensorFlower } else { 645824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth, 646a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight, kBlockDepth, true>( 647a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 6487fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 6497fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower} 6507fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 651824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlowertemplate <typename T, DepthwiseConv2dDirection kDirection, 652824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower int kKnownFilterWidth, int kKnownFilterHeight> 653a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPUSmall(const GpuDevice& device, 65445fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, const T* input, 65545fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const T* filter, T* output, 6567fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 657a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Maximize (power of two) kBlockDepth while keeping a block within 1024 6587fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower // threads (2 pixels per thread). 6597fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const int block_pixels = (args.in_rows + 1) / 2 * args.in_cols; 6607fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (block_pixels > 256) { 661824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth, 662a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight, 2>( 663a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 6647fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else if (block_pixels > 128) { 665824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth, 666a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight, 4>( 667a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 6687fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else { 669824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, kDirection, kKnownFilterWidth, 670a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight, 8>( 671a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 6723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 6733f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower} 6743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 6757828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 6767828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 677a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& device, 678a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const DepthwiseArgs& args, const T* input, 679a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const T* filter, T* output, 6807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 681824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int); 682428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower switch (data_format) { 683428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NHWC: 684428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = 685428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight, 686428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownDepthMultiplier>; 687428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 688428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW: 689428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = 690428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight, 691428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kKnownDepthMultiplier>; 692428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 693428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW_VECT_C: 694428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported"; 695428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower return; 696824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower } 6977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int num_outputs = 6987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.batch * args.out_rows * args.out_cols * args.out_depth; 699a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower CudaLaunchConfig config = 700a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower GetCudaLaunchConfig(num_outputs, device, kernel, 0, 0); 7013c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower // The compile-time constant version runs faster with a single block. 7023c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 || 7033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower kKnownDepthMultiplier < 0 7043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower ? std::numeric_limits<int>::max() 705a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower : device.getNumCudaMultiProcessors(); 706824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower kernel<<<std::min(max_block_count, config.block_count), 707a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower config.thread_per_block, 0, device.stream()>>>(args, input, filter, 708a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower output, num_outputs); 7097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 710b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 7117fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight> 712a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& device, 713a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const DepthwiseArgs& args, const T* input, 714a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const T* filter, T* output, 7157fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 7167fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (args.depth_multiplier == 1) { 7177fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (CanLaunchDepthwiseConv2dGPUSmall(args)) { 718824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_FORWARD, kKnownFilterWidth, 719a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kKnownFilterHeight>( 720a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 7217fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return; 7227fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 7237fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 7247fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, 1>( 725a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 7267fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else { 7277fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, kKnownFilterWidth, kKnownFilterHeight, -1>( 728a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, input, filter, output, data_format); 7297fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 7307fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower} 7317fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 732b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 733b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate <typename T> 73445fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvOp<GpuDevice, T>::operator()(OpKernelContext* ctx, 73545fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, 73607356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower const T* input, 73707356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower const T* filter, T* output, 73807356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower TensorFormat data_format) { 739a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device = ctx->eigen_device<GpuDevice>(); 74007356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3) { 741a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, 3, 3>(device, args, input, filter, output, 74207356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower data_format); 74307356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower } else { 744a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, -1, -1>(device, args, input, filter, output, 7457fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower data_format); 746b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 74707356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower auto stream = ctx->op_device_context()->stream(); 74807356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower OP_REQUIRES(ctx, stream->ok(), 74907356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower errors::Internal( 75007356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower "Launch of gpu kernel for DepthwiseConv2dGPULaunch failed")); 75107356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower} 752b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 75345fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, Eigen::half>; 75445fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, float>; 75545fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvOp<GpuDevice, double>; 756b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 7575f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. 7587828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 7597828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 7603c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 7613c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args, 7623c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 7633c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* in_backprop, 7643c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_in_backprop) { 765a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 766a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 7675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_depth = args.in_depth; 768a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 7697828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 770a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 7717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 7727828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 7737828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 7745f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int stride = args.stride; 775a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 776a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 777a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 778a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 7795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_depth = args.out_depth; 7805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 7815f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { 7825f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the indexes of this thread in the output. 783a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = thread_id % in_depth; 784a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = (thread_id / in_depth) % in_width; 785a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = (thread_id / in_depth / in_width) % in_height; 786a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / in_depth / in_width / in_height; 7875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 788b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum = static_cast<T>(0); 7895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 790a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row_start = 791a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride); 792a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row_end = 793a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_min(out_height - 1, (in_row + pad_height) / stride); 794a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col_start = 795a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_max(0, (in_col - filter_width + pad_width + stride) / stride); 796a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col_end = 797a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_min(out_width - 1, (in_col + pad_width) / stride); 798a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 799a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower NOUNROLL for (int out_row = out_row_start; out_row <= out_row_end; 800a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++out_row) { 801a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_row = in_row + pad_height - out_row * stride; 8022f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int temp_out_backprop_offset = 803a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower out_depth * out_width * (out_row + out_height * batch); 804a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int temp_filter_offset = filter_width * filter_row; 805a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower NOUNROLL for (int out_col = out_col_start; out_col <= out_col_end; 806a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++out_col) { 807a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_col = in_col + pad_width - out_col * stride; 8082f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower int filter_offset = 809a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower depth_multiplier * 810a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + temp_filter_offset)); 8112f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int out_backprop_offset = 812a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower out_depth * out_col + temp_out_backprop_offset; 813b286574da19e18371e759fe6b676bb07728ef9acA. Unique TensorFlower#pragma unroll 6 8142f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower for (int i = 0; i < depth_multiplier; ++i) { 8152f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower sum += ldg(out_backprop + out_backprop_offset + 816a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel * depth_multiplier + i) * 8172f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower ldg(filter + filter_offset + i); 8185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 8195f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 8205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 8215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_backprop_offset = 822a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel + 823a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_depth * (in_col + in_width * (in_row + in_height * batch)); 8245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_backprop[in_backprop_offset] = sum; 8255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 8265f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen} 8275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 8287828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 8297828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 8303c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 831ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args, 832ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* out_backprop, 833ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* filter, T* in_backprop, 834ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan int num_in_backprop) { 835a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 836a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 837ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 838a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 8397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 840a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 8417828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 8427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 8437828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 844ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 845a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 846a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 847a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 848a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 849ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 850ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 851ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // TODO(vrv): Consider assigning threads to output and using 852ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // atomics for accumulation, similar to the filter case. 853ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { 854ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the input. 855a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = thread_id % in_width; 856a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = (thread_id / in_width) % in_height; 857a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = (thread_id / in_width / in_height) % in_depth; 858a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / in_depth / in_width / in_height; 859ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 860b1d8c59e9b014b527fb2fbef9ce9afc14dbc4938Yifei Feng T sum = static_cast<T>(0); 861a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel_start = in_channel * depth_multiplier; 862a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel_end = out_channel_start + depth_multiplier; 863a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 864a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row_start = 865a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_max<int>(0, (in_row - filter_height + pad_height + stride) / stride); 866a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row_end = 867a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_min(out_height - 1, (in_row + pad_height) / stride); 868a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col_start = 869a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_max(0, (in_col - filter_width + pad_width + stride) / stride); 870a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col_end = 871a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower tf_min(out_width - 1, (in_col + pad_width) / stride); 872a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 873a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int out_channel = out_channel_start; 874a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower out_channel < out_channel_end; ++out_channel) { 875a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int out_row = out_row_start; out_row <= out_row_end; 876a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++out_row) { 877a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_row = in_row + pad_height - out_row * stride; 878a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_dm = out_channel - out_channel_start; 879a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 880a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int temp_filter_offset = filter_width * filter_row; 881a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower for (int out_col = out_col_start; out_col <= out_col_end; ++out_col) { 882a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_col = in_col + pad_width - out_col * stride; 883ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 884a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_dm + 885a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower args.depth_multiplier * 886a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + temp_filter_offset)); 887ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 888ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_backprop_offset = 889a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (batch * out_depth * out_height * out_width) + 890a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (out_channel * out_height * out_width) + (out_row * out_width) + 891a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (out_col); 892ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 893ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(out_backprop + out_backprop_offset) * 894ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan ldg(filter + filter_offset); 895ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 896ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 897ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 898a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_backprop_offset = (batch * in_height * in_width * in_depth) + 899a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel * in_height * in_width) + 900a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_row * in_width) + (in_col); 901ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan in_backprop[in_backprop_offset] = sum; 902ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 903ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 904ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 9057828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 9067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 907a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device, 90845fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, 9097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* out_backprop, 9107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* filter, T* in_backprop, 9117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 912824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int); 913428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower switch (data_format) { 914428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NHWC: 915428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropInputGPUKernelNHWC< 916428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; 917428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 918428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW: 919428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropInputGPUKernelNCHW< 920428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; 921428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 922428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW_VECT_C: 923428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported"; 924428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower return; 9257828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } 926824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower const int num_in_backprop = 927824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower args.batch * args.in_rows * args.in_cols * args.in_depth; 928824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower CudaLaunchConfig config = 929a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower GetCudaLaunchConfig(num_in_backprop, device, kernel, 0, 0); 930a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>( 931824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower args, out_backprop, filter, in_backprop, num_in_backprop); 9327828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 9337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower 9347fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight> 935a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& device, 93645fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, 9377fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* out_backprop, 9387fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* filter, T* in_backprop, 9397fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 9407fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (args.depth_multiplier == 1) { 9417fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (CanLaunchDepthwiseConv2dGPUSmall(args)) { 942824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower LaunchDepthwiseConv2dGPUSmall<T, DIRECTION_BACKWARD, kKnownFilterWidth, 943824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower kKnownFilterHeight>( 944a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, filter, in_backprop, data_format); 9457fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return; 9467fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 9477fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 9487fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth, 9497fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower kKnownFilterHeight, 1>( 950a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, filter, in_backprop, data_format); 9517fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else { 9527fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, kKnownFilterWidth, 9537fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower kKnownFilterHeight, -1>( 954a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, filter, in_backprop, data_format); 9557fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 9567fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower} 9577fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 9585f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 9595f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T> 96045fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvBackpropInputOp<GpuDevice, T>::operator()( 96107356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, 96207356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower const T* filter, T* in_backprop, TensorFormat data_format) { 963a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device = ctx->eigen_device<GpuDevice>(); 96407356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3) { 96507356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3>( 966a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, filter, in_backprop, data_format); 96707356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower } else { 96807356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1>( 969a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, filter, in_backprop, data_format); 9705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 97107356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower auto stream = ctx->op_device_context()->stream(); 97207356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower OP_REQUIRES(ctx, stream->ok(), 97307356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower errors::Internal("Launch of gpu kernel for " 97407356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower "DepthwiseConv2dBackpropInp" 97507356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower "utGPULaunch failed")); 97607356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower} 9775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 97845fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, Eigen::half>; 97945fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, float>; 98045fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropInputOp<GpuDevice, double>; 9815f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 9825f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 9837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 9847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 9853c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 9863c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args, 9873c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 9883c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* input, 9893c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T* filter_backprop, 9903c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_out_backprop) { 991a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 992a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 9935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_depth = args.in_depth; 994a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 9957828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 996a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 9977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 9987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 9997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 10005f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int stride = args.stride; 1001a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 1002a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 1003a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 1004a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 10055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_depth = args.out_depth; 10065f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 10075f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) { 10085f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the indexes of this thread in the output. 1009a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel = thread_id % out_depth; 1010a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col = (thread_id / out_depth) % out_width; 1011a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row = (thread_id / out_depth / out_width) % out_height; 1012a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / out_depth / out_width / out_height; 10135f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the input depth and the index of depth multiplier. 1014a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = out_channel / depth_multiplier; 1015a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int dm = out_channel % depth_multiplier; 10165f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1017ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 1018ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 1019a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_start = out_row * stride - pad_height; 1020a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col_start = out_col * stride - pad_width; 1021a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_end = in_row_start + filter_height; 1022a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col_end = in_col_start + filter_width; 10235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 10245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_backprop_offset = 1025a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower out_channel + 1026a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower out_depth * (out_col + out_width * (out_row + out_height * batch)); 10275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const T out_bp = ldg(out_backprop + out_backprop_offset); 1028a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height && 1029a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col_end < in_width) { 1030a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 1031a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 1032a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = in_row_start + filter_row; 10335f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Avoid repeated computation. 1034a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = in_width * (in_row + in_height * batch); 1035a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 1036a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 1037a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = in_col_start + filter_col; 10385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1039a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset = 1040a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel + in_depth * (in_col + input_offset_temp); 10415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T partial_sum = ldg(input + input_offset) * out_bp; 1042a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower T* addr = 1043a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_backprop + 1044a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (dm + depth_multiplier * 1045a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + 1046a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_depth * (filter_col + filter_width * filter_row))); 10475f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CudaAtomicAdd(addr, partial_sum); 10485f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 10495f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 10505f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } else { 1051a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 1052a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 1053a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = in_row_start + filter_row; 10545f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Avoid repeated computation. 1055a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = in_width * (in_row + in_height * batch); 1056a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 1057a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 1058a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = in_col_start + filter_col; 1059a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int addr_temp = filter_width * filter_row; 1060a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 1061a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row >= 0 && in_row < in_height && in_col >= 0 && 1062a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col < in_width) { 10635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 1064a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_channel + in_depth * (in_col + input_offset_temp); 10655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T partial_sum = ldg(input + input_offset) * out_bp; 10665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T* addr = 10675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen filter_backprop + 1068a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (dm + depth_multiplier * 1069a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + addr_temp))); 10705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Potentially many threads can add to the same address so we have 10715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // to use atomic add here. 10725f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // TODO(jmchen): If atomic add turns out to be slow, we can: 10735f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // 1. allocate multiple buffers for the gradients (one for each 1074ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // example in a batch, for example). This can reduce the 1075ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // contention on the destination; 2. Have each thread compute one 1076ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // gradient for an element in the filters. This should work well 1077ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // when the input depth is big and filter size is not too small. 1078ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CudaAtomicAdd(addr, partial_sum); 1079ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1080ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1081ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1082ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1083ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1084ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 1085ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1086a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// Device function to compute sub-warp sum reduction for a power-of-two group of 1087a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen// neighboring threads. 10880f65c8f572201f8838189f3e3c3e455759112c14A. Unique TensorFlowertemplate <int kWidth, typename T> 1089a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen__device__ __forceinline__ T WarpSumReduce(T val) { 1090a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen // support only power-of-two widths. 1091a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen assert(__popc(kWidth) == 1); 1092a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen int sub_warp = cub::LaneId() / kWidth; 1093a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen int zeros = sub_warp * kWidth; 1094a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen unsigned mask = ((1UL << kWidth) - 1) << zeros; 1095a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen for (int delta = kWidth / 2; delta > 0; delta /= 2) { 1096abdc62aee1eeba32be56d761a2f9988306356084A. Unique TensorFlower val += CudaShuffleXorSync(mask, val, delta); 1097a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen } 1098a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen return val; 1099a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen} 1100a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen 11013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in 11027fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// NHWC format, tailored for small images up to 32x32. Stride and depth 1103f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if 1104f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// CanLaunchDepthwiseConv2dGPUSmall(args) returns true. 11053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// Tiles of the input tensor are loaded into shared memory before performing the 11063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// convolution. Per iteration and filter element, each thread first performs 11073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// a partial convolution for two elements, one each in the lower and upper half 11087fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// of a tile. The intermediate result of all pixels of a warp are then 11093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower// accumulated and written to shared memory. Finally, the values in shared 1110f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed 1111f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// up in global memory using atomics. 11127fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// Requirements: threads per block must be multiple of 32 and <= launch_bounds, 1113a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth. 1114f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 1115a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kBlockDepth, int kAccumPixels> 11163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__global__ 11173f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( 11183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const DepthwiseArgs args, const T* output, const T* input, T* filter) { 1119f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); 11203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Holds block plus halo and filter data for blockDim.x depths. 11213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; 11223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* const shared_data = reinterpret_cast<T*>(shared_memory); 11233f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1124a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int num_batches = args.batch; 1125a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 1126a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = blockDim.y; // slower (see b/62280718): args.in_cols; 11273f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int in_depth = args.in_depth; 1128a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 11293f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 1130a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 11313f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 1132a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 1133a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 11343f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1135428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.x == kBlockDepth); 1136428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.y == args.in_cols); 1137a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_height = blockDim.z; 11383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // These values are the same for all threads and could 11403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // be precomputed on the CPU. 1141a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_size = block_height * in_width * kBlockDepth; 11427fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower assert((block_size & 31) == 0); 1143a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_size = in_width * in_depth; 1144a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_size = in_height * in_row_size; 1145a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_increment = (in_width - 1) * kBlockDepth; 1146a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pixels = filter_height * filter_width; 1147a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = in_width + filter_width - 1; 1148a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = 2 * block_height + filter_height - 1; 1149a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_row_size = tile_width * kBlockDepth; 1150a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_size = tile_height * tile_row_size; 1151a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_offset = block_height * tile_row_size; 1152a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_offset = pad_height * tile_width + pad_width; 1153a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch_blocks = (in_depth + kBlockDepth - 1) / kBlockDepth; 1154a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_blocks = batch_blocks * num_batches; 1155a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tensor_offset = block_height * in_row_size; 1156f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // The accumulator has a fixed number of pixels that can be reduced by one 1157a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written. 1158a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth); 1159a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int accum_increment = kAccumPixels * kBlockDepth; 11605cf08d9cb3d79b05ed1c41e36dfb0de934979610A. Unique TensorFlower const int accum_size = filter_pixels * accum_increment; 11613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11623f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_depth = threadIdx.x; 11633f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_col = threadIdx.y; 11643f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int thread_row = threadIdx.z; 11653f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11663f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in block. 1167a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_pix = thread_row * in_width + thread_col; 1168a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_idx = thread_pix * kBlockDepth + thread_depth; 11693f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11703f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Initialize tile, in particular the padding and accumulator. 11713f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower for (int i = thread_idx; i < tile_size + accum_size; i += block_size) { 11723f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower shared_data[i] = T(0); 11733f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 11743f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 11753f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11763f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in tensors. 11773f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int tensor_idx = thread_pix * in_depth + thread_depth; 11783f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11793f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Position in (padded) shared memory. 1180a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_pix = thread_row * tile_width + thread_col; 1181a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_idx = data_pix * kBlockDepth + thread_depth; 11823f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1183a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in shared memory, offset by pad_height / pad_width. 11843f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int tile_pix = data_pix + pad_offset; 1185a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_idx = tile_pix * kBlockDepth + thread_depth; 11863f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1187a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in accumulator (kBlockDepth per warp, depth major). 1188a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int accum_pix = thread_pix / (32 / kBlockDepth); 1189f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower const int accum_idx = thread_depth * kAccumPixels + accum_pix; 11903f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1191a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int max_channel = in_depth - thread_depth; 11923f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int accum_offset = tile_size + accum_idx; 1193a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool skip_second = block_height + thread_row >= in_height; 11943f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 11953f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { 11963f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int batch = b / batch_blocks; 1197a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block = b - batch * batch_blocks; 11983f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1199a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int start_channel = block * kBlockDepth; 1200a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset = tensor_idx + start_channel; 12013f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const int inout_offset = batch * in_size + filter_offset; 1202a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool channel_in_range = start_channel < max_channel; 12033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1204a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 12053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const in_ptr = inout_offset + input; 12063f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* const tile_ptr = tile_idx + shared_data; 12073f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower tile_ptr[0] = ldg(in_ptr); 12083f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower if (!skip_second) { 12093f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower tile_ptr[tile_offset] = ldg(tensor_offset + in_ptr); 12103f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12113f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12123f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 12133f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 12143f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 1215a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range); 12163f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1217a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 12183f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const out_ptr = inout_offset + output; 12193f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T out1 = ldg(out_ptr); 12203f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T out2 = skip_second ? T(0) : ldg(tensor_offset + out_ptr); 12213f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower int shared_offset = data_idx; 12223f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T* accum_ptr = accum_offset + shared_data; 1223a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int r = 0; r < filter_height; ++r) { 1224a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int c = 0; c < filter_width; ++c) { 12253f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const tile_ptr = shared_offset + shared_data; 12263f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; 12277fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower // Warp-accumulate pixels of the same depth and write to accumulator. 1228a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower for (int delta = 16; delta >= kBlockDepth; delta /= 2) { 1229abdc62aee1eeba32be56d761a2f9988306356084A. Unique TensorFlower val += CudaShuffleXorSync(active_threads, val, delta); 12307fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 1231a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (!(thread_idx & 32 - kBlockDepth) /* lane_idx < kBlockDepth */) { 12323f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower *accum_ptr = val; 12333f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 1234a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower shared_offset += kBlockDepth; 12353f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower accum_ptr += accum_increment; 12363f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12373f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower shared_offset += in_increment; 12383f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12393f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12403f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 12413f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 12423f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower __syncthreads(); 12433f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 12443f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* const accum_data = tile_size + shared_data; 12453f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower for (int i = thread_idx; i < accum_size; i += block_size) { 1246f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower const int filter_idx = i / kAccumPixels; 1247a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pix = filter_idx / kBlockDepth; 1248a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_channel = filter_idx % kBlockDepth + start_channel; 1249a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset = filter_pix * in_depth + filter_channel; 1250a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (filter_channel < in_depth) { 12513f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower T val = accum_data[i]; 1252f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower // Warp-accumulate the pixels of the same depth from the accumulator. 1253a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen val = WarpSumReduce<kAccumPixels>(val); 1254f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower if (!(thread_idx & kAccumPixels - 1)) { 12553f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower CudaAtomicAdd(filter_offset + filter, val); 12563f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12573f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12583f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12593f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower } 12603f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower} 12613f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 1262ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 12637828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 12647828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 12653c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 12663c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args, 12673c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 12683c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* input, 12693c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T* filter_backprop, 12703c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_out_backprop) { 1271a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 1272a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = args.in_cols; 1273ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 1274a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 12757828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 1276a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 12777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 12787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 12797828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 1280ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 1281a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 1282a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 1283a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_height = args.out_rows; 1284a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_width = args.out_cols; 1285ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 1286ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1287ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) { 1288ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the output. 1289a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_col = thread_id % out_width; 1290a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_row = (thread_id / out_width) % out_height; 1291a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_channel = (thread_id / out_width / out_height) % out_depth; 1292ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1293a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int batch = thread_id / out_depth / out_width / out_height; 1294ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the input depth and the index of depth multiplier. 1295a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_channel = out_channel / depth_multiplier; 1296a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int dm = out_channel % depth_multiplier; 1297ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1298ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 1299ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 1300a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_start = out_row * stride - pad_height; 1301a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col_start = out_col * stride - pad_width; 1302a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row_end = in_row_start + filter_height; 1303a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col_end = in_col_start + filter_width; 1304ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1305a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int out_backprop_offset = 1306a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (batch * out_depth * out_height * out_width) + 1307a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (out_channel * out_height * out_width) + (out_row * out_width) + 1308a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (out_col); 1309ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 1310ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T out_bp = ldg(out_backprop + out_backprop_offset); 1311a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row_start >= 0 && in_col_start >= 0 && in_row_end < in_height && 1312a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col_end < in_width) { 1313a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 1314a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 1315a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = in_row_start + filter_row; 1316ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Avoid repeated computation. 1317a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = 1318a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (batch * in_depth * in_height * in_width) + 1319a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel * in_height * in_width) + (in_row * in_width); 1320a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 1321a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 1322a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 1323a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = in_col_start + filter_col; 1324a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset = input_offset_temp + in_col; 1325ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T partial_sum = ldg(input + input_offset) * out_bp; 1326a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower T* addr = 1327a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower filter_backprop + 1328a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (dm + depth_multiplier * 1329a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + 1330a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_depth * (filter_col + filter_width * filter_row))); 1331ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CudaAtomicAdd(addr, partial_sum); 1332ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1333ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 1334ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 1335a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_row = 0; filter_row < filter_height; 1336a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_row) { 1337a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_row = in_row_start + filter_row; 1338ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Avoid repeated computation. 1339a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset_temp = 1340a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (batch * in_depth * in_height * in_width) + 1341a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel * in_height * in_width) + (in_row * in_width); 1342a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int filter_col = 0; filter_col < filter_width; 1343a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower ++filter_col) { 1344a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_col = in_col_start + filter_col; 1345a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int addr_temp = filter_width * filter_row; 1346a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower 1347a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (in_row >= 0 && in_row < in_height && in_col >= 0 && 1348a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower in_col < in_width) { 1349a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int input_offset = input_offset_temp + in_col; 1350ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T partial_sum = ldg(input + input_offset) * out_bp; 1351ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T* addr = 1352ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan filter_backprop + 1353a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (dm + depth_multiplier * 1354a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (in_channel + in_depth * (filter_col + addr_temp))); 1355ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Potentially many threads can add to the same address so we have 1356ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // to use atomic add here. 1357ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // TODO(jmchen): If atomic add turns out to be slow, we can: 1358ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 1. allocate multiple buffers for the gradients (one for each 1359ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // example in a batch, for example). This can reduce the 1360ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // contention on the destination; 2. Have each thread compute one 1361ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // gradient for an element in the filters. This should work well 1362ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // when the input depth is big and filter size is not too small. 13635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CudaAtomicAdd(addr, partial_sum); 13645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 13655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 13665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 13675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 13685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 13695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen} 13705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1371f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// CUDA kernel to compute the depthwise convolution backward w.r.t. filter in 13727fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// NCHW format, tailored for small images up to 32x32. Stride and depth 1373f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// multiplier must be 1. Padding must be 'SAME'. Only use this kernel if 1374f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// CanLaunchDepthwiseConv2dGPUSmall(args) returns true. 1375f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// Tiles of the input tensor are loaded into shared memory before performing the 1376f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// convolution. Per iteration and filter element, each thread first performs 1377f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// a partial convolution for two elements, one each in the lower and upper half 13787fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// of a tile. The intermediate result of all pixels of a warp are then 1379f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower// accumulated and written to shared memory. Finally, the values in shared 1380f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// memory are warp-accumulated (in chunks of kAccumPixels elements) and summed 1381f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower// up in global memory using atomics. 13827fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower// Requirements: threads per block must be multiple of 32 and <= launch_bounds, 1383a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower// kAccumPixels * 64 >= args.in_rows * args.in_cols * kBlockDepth. 1384f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 1385a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kBlockDepth, int kAccumPixels> 1386f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower__global__ 1387f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower__launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( 1388f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const DepthwiseArgs args, const T* output, const T* input, T* filter) { 1389f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x)); 1390f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Holds block plus halo and filter data for blockDim.z depths. 1391f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower extern __shared__ __align__(sizeof(T)) unsigned char shared_memory[]; 1392f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower T* const shared_data = reinterpret_cast<T*>(shared_memory); 1393f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1394a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int num_batches = args.batch; 1395a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_height = args.in_rows; 1396a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_width = blockDim.x; // slower (see b/62280718): args.in_cols; 1397f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int in_depth = args.in_depth; 1398a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_height = 1399f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 1400a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_width = 1401f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 1402a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_height = args.pad_rows; 1403a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_width = args.pad_cols; 1404f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1405428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.x == args.in_cols); 1406428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower assert(blockDim.z == kBlockDepth); 1407a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_height = blockDim.y; 1408f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1409f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // These values are the same for all threads and could 1410f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // be precomputed on the CPU. 1411a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_pixels = in_width * block_height; 1412a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_size = block_pixels * kBlockDepth; 14137fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower assert((block_size & 31) == 0); 1414a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_pixels = in_width * in_height; 1415a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_increment = in_width - 1; 1416a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pixels = filter_height * filter_width; 1417a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = in_width + filter_width - 1; 1418a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = 2 * block_height + filter_height - 1; 1419a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_pixels = tile_width * tile_height; 1420a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_size = tile_pixels * kBlockDepth; 1421a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_offset = block_height * tile_width; 1422a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int pad_offset = pad_height * tile_width + pad_width; 1423a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_total_depth = in_depth * num_batches; 1424a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int in_blocks = (in_total_depth + kBlockDepth - 1) / kBlockDepth; 1425f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // The accumulator has a fixed number of pixels that can be reduced by one 1426a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // warp. Pixels beyond ceil(in_pixels * kBlockDepth / 64) are never written. 1427a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower assert(kAccumPixels * 64 >= in_height * in_width * kBlockDepth); 1428a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int accum_increment = kAccumPixels * kBlockDepth; 1429f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int accum_size = filter_pixels * accum_increment; 1430f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1431f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int thread_col = threadIdx.x; 1432f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int thread_row = threadIdx.y; 1433f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int thread_depth = threadIdx.z; 1434f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1435f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Position in block. 1436a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int thread_pix = thread_row * in_width + thread_col; 1437f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int thread_idx = thread_depth * block_pixels + thread_pix; 1438f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1439f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Initialize tile, in particular the padding and accumulator. 1440f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower for (int i = thread_idx; i < tile_size + accum_size; i += block_size) { 1441f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower shared_data[i] = T(0); 1442f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1443f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower __syncthreads(); 1444f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1445f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Position in tensors. 1446f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int tensor_idx = thread_depth * in_pixels + thread_pix; 1447f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1448f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Position in (padded) shared memory. 1449a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int data_pix = thread_row * tile_width + thread_col; 1450f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int data_idx = thread_depth * tile_pixels + data_pix; 1451f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1452a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in shared memory, offset by pad_height / pad_width. 1453f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int tile_idx = data_idx + pad_offset; 1454f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1455a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Position in accumulator (kBlockDepth per warp, depth major). 1456a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int accum_pix = thread_pix / (32 / kBlockDepth); 1457f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower const int accum_idx = thread_depth * kAccumPixels + accum_pix; 1458f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1459a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int max_channel = in_total_depth - thread_depth; 1460f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const int accum_offset = tile_size + accum_idx; 1461a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool skip_second = block_height + thread_row >= in_height; 1462f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1463f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower for (int b = blockIdx.x; b < in_blocks; b += gridDim.x) { 1464a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int channel = b * kBlockDepth; 1465f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1466a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int inout_offset = channel * in_pixels + tensor_idx; 1467a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const bool channel_in_range = channel < max_channel; 1468f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1469a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 1470f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T* const in_ptr = inout_offset + input; 1471f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower T* const tile_ptr = tile_idx + shared_data; 1472f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower tile_ptr[0] = ldg(in_ptr); 1473f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower if (!skip_second) { 1474f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower tile_ptr[tile_offset] = ldg(block_pixels + in_ptr); 1475f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1476f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1477f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1478f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 1479f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower __syncthreads(); 1480a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range); 1481f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1482a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (channel_in_range) { 1483f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T* const out_ptr = inout_offset + output; 1484f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T out1 = ldg(out_ptr); 1485f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T out2 = skip_second ? T(0) : ldg(block_pixels + out_ptr); 1486f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower int shared_offset = data_idx; 1487f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower T* accum_ptr = accum_offset + shared_data; 1488a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int r = 0; r < filter_height; ++r) { 1489a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower UNROLL for (int c = 0; c < filter_width; ++c) { 1490f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T* const tile_ptr = shared_offset + shared_data; 1491f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower T val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; 14927fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower // Warp-accumulate pixels of the same depth and write to accumulator. 1493a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower for (int delta = 16 / kBlockDepth; delta > 0; delta /= 2) { 1494abdc62aee1eeba32be56d761a2f9988306356084A. Unique TensorFlower val += CudaShuffleXorSync(active_threads, val, delta); 14957fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 1496a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (!(thread_idx & 32 / kBlockDepth - 1)) { 1497a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower *accum_ptr = val; // kBlockDepth threads per warp. 1498f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1499f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower ++shared_offset; 1500f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower accum_ptr += accum_increment; 1501f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1502f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower shared_offset += in_increment; 1503f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1504f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1505f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1506f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower // Note: the condition to reach this is uniform across the entire block. 1507f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower __syncthreads(); 1508f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1509f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower const T* const accum_data = tile_size + shared_data; 1510f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower for (int i = thread_idx; i < accum_size; i += block_size) { 1511f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower const int filter_idx = i / kAccumPixels; 1512a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_pix = filter_idx / kBlockDepth; 1513a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_channel = 1514a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower (channel + filter_idx % kBlockDepth) % in_depth; 1515a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int filter_offset = filter_pix * in_depth + filter_channel; 1516a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (filter_channel < in_depth) { 1517f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower T val = accum_data[i]; 1518f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower // Warp-accumulate pixels of the same depth from the accumulator. 1519a373b1f74215e44920bf9362a51bece530edf88aPatrick Nguyen val = WarpSumReduce<kAccumPixels>(val); 1520f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower if (!(thread_idx & kAccumPixels - 1)) { 1521f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower CudaAtomicAdd(filter_offset + filter, val); 1522f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1523f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1524f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1525f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1526f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower} 1527f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 1528f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 1529a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kBlockDepth, int kAccumPixels> 15307fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( 1531a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device, const DepthwiseArgs& args, const int block_height, 15327fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* out_backprop, const T* input, T* filter_backprop, 15337fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 1534a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_width = args.in_cols + args.filter_cols - 1; 1535a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_height = block_height * 2 + args.filter_rows - 1; 1536a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int tile_pixels = tile_height * tile_width; 15377fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const int filter_pixels = args.filter_rows * args.filter_cols; 15387fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const int shared_memory_size = 1539a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kBlockDepth * (tile_pixels + filter_pixels * kAccumPixels) * sizeof(T); 1540a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (shared_memory_size > device.sharedMemPerBlock()) { 15417fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return false; 15427fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 15437fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 1544824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower dim3 block_dim; 1545428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower int block_count; 1546824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower void (*kernel)(const DepthwiseArgs, const T*, const T*, T*); 1547428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower switch (data_format) { 1548428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NHWC: 1549428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim = dim3(kBlockDepth, args.in_cols, block_height); 1550428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_count = 1551428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower args.batch * DivUp(args.out_depth, kBlockDepth) * kBlockDepth; 1552428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall< 1553428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>; 1554428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 1555428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW: 1556428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim = dim3(args.in_cols, block_height, kBlockDepth); 1557428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_count = 1558428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower DivUp(args.batch * args.out_depth, kBlockDepth) * kBlockDepth; 1559428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall< 1560428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, kAccumPixels>; 1561428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 1562428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW_VECT_C: 1563428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported"; 1564428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower return false; 1565f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower } 1566428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower const int num_out_backprop = args.out_rows * args.out_cols * block_count; 1567428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( 1568428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower num_out_backprop, device, kernel, shared_memory_size, 1569428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower block_dim.x * block_dim.y * block_dim.z); 1570a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kernel<<<config.block_count, block_dim, shared_memory_size, 1571a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device.stream()>>>(args, out_backprop, input, filter_backprop); 15727fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return true; 15737fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower} 15747fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 15757fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 1576a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int kBlockDepth> 15777fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( 1578a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device, const DepthwiseArgs& args, const int block_height, 15797fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* out_backprop, const T* input, T* filter_backprop, 15807fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 15817fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower // Minimize (power of two) kAccumPixels, while satisfying 1582a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // kAccumPixels * 32 >= block_height * in_width * kBlockDepth. 1583a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const int block_pixels = block_height * args.in_cols * kBlockDepth; 15847fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (block_pixels > 512) { 15857fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 1586a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 32>( 1587a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 1588a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower data_format); 15897fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else if (block_pixels > 256) { 15907fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 1591a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 16>( 1592a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 1593a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower data_format); 15947fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else { 15957fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 1596a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kBlockDepth, 8>( 1597a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 1598a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower data_format); 15997fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 1600f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower} 1601f0c4c6c3f3a7e6df4dbd98385ec96a72638d5031A. Unique TensorFlower 16028ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevantemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight> 16033f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlowerbool TryLaunchDepthwiseConv2dBackpropFilterGPUSmall( 1604a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device, const DepthwiseArgs& args, const T* out_backprop, 16053f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower const T* input, T* filter_backprop, TensorFormat data_format) { 1606a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // Maximize (power of two) kBlockDepth while keeping a block within 1024 16077fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower // threads (2 pixels per thread). 1608a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int block_depth = 8; 1609a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int block_height = (args.in_rows + 1) / 2; 16107fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower int round_mask = 1; 1611a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower for (; block_depth > 1; block_depth /= 2) { 1612a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower // args.in_cols * block_height * kBlockDepth must be multiple of 32. 1613a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower for (; block_height * args.in_cols * block_depth & 31; 16147fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower round_mask = round_mask * 2 + 1) { 1615a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower block_height = block_height + round_mask & ~round_mask; 16167fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 1617a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower int block_size = block_height * args.in_cols * block_depth; 16187fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (block_size <= 1024) { 16197fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower break; 16207fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 16218ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan } 16228ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan 1623a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower if (!CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, block_height)) { 16248ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan return false; 16258ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan } 16268ca083610266e50e3ce8b7c4913bcf9d9f3af57fVijay Vasudevan 1627a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower switch (block_depth) { 16287fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower case 8: 16297fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 16307fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, 8>( 1631a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 16327fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower data_format); 16337fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower case 4: 16347fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 16357fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, 4>( 1636a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 16377fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower data_format); 16387fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower case 2: 16397fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return TryLaunchDepthwiseConv2dBackpropFilterGPUSmall< 16407fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, 2>( 1641a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, block_height, out_backprop, input, filter_backprop, 16427fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower data_format); 16437fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower default: 16447fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return false; 1645f105df0478cea110129811062ca3d29f289492c0A. Unique TensorFlower } 16463f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower} 16473f9b69a50f40154f6078e1610ce7d3afa94bd07cA. Unique TensorFlower 16487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 16497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 1650a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device, 165145fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, 16527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* out_backprop, 16537828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* input, T* filter_backprop, 16547828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 1655824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower void (*kernel)(const DepthwiseArgs, const T*, const T*, T*, int); 1656428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower switch (data_format) { 1657428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NHWC: 1658428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropFilterGPUKernelNHWC< 1659428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; 1660428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 1661428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW: 1662428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower kernel = DepthwiseConv2dBackpropFilterGPUKernelNCHW< 1663428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>; 1664428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower break; 1665428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower case FORMAT_NCHW_VECT_C: 1666428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower LOG(ERROR) << "FORMAT_NCHW_VECT_C is not supported"; 1667428d034227c9e7b637de0194d80cac3976a37eefA. Unique TensorFlower return; 16687828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } 1669824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower const int num_out_backprop = 1670824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower args.batch * args.out_rows * args.out_cols * args.out_depth; 1671824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower CudaLaunchConfig config = 1672a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower GetCudaLaunchConfig(num_out_backprop, device, kernel, 0, 0); 1673a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower kernel<<<config.block_count, config.thread_per_block, 0, device.stream()>>>( 1674824f13801e0653d517c7f6b083295967c4c2dee8A. Unique TensorFlower args, out_backprop, input, filter_backprop, num_out_backprop); 16757828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 16767828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower 16777fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight> 1678a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& device, 167945fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlower const DepthwiseArgs& args, 16807fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* out_backprop, 16817fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower const T* input, T* filter_backprop, 16827fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower TensorFormat data_format) { 16837fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (args.depth_multiplier == 1) { 16847fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower if (TryLaunchDepthwiseConv2dBackpropFilterGPUSmall<T, kKnownFilterWidth, 16857fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower kKnownFilterHeight>( 1686a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, input, filter_backprop, data_format)) { 16877fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower return; 16887fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 16897fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 16907fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth, 16917fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower kKnownFilterHeight, 1>( 1692a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, input, filter_backprop, data_format); 16937fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } else { 16947fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, kKnownFilterWidth, 16957fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower kKnownFilterHeight, -1>( 1696a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, input, filter_backprop, data_format); 16977fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower } 16987fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower} 16997fffdb236ecaf7a2f50f3363e947b19e2a5a327aA. Unique TensorFlower 17005f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 17015f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T> 170245fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowervoid LaunchDepthwiseConvBackpropFilterOp<GpuDevice, T>::operator()( 170307356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, 170407356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower const T* input, T* filter_backprop, TensorFormat data_format) { 1705a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower const GpuDevice& device = ctx->eigen_device<GpuDevice>(); 170607356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower auto stream = ctx->op_device_context()->stream(); 170707356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower 170807356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower // Initialize the results to 0. 170907356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower int num_filter_backprop = 171007356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower args.filter_rows * args.filter_cols * args.out_depth; 171107356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower perftools::gputools::DeviceMemoryBase filter_bp_ptr(filter_backprop, 171207356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower num_filter_backprop); 171307356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower stream->ThenMemset32(&filter_bp_ptr, 0, num_filter_backprop * sizeof(T)); 171407356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower 171507356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3) { 171607356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3>( 1717a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, input, filter_backprop, data_format); 171807356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower } else { 171907356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1>( 1720a1befe0603418c4a8bc3ea143bd757ac1d5a1fecA. Unique TensorFlower device, args, out_backprop, input, filter_backprop, data_format); 17215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 172207356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower OP_REQUIRES(ctx, stream->ok(), 172307356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower errors::Internal("Launch of gpu kernel for " 172407356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower "DepthwiseConv2dBackpropFil" 172507356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower "terGPULaunch failed")); 172607356b48e4b374efd406fd142faa77cfa4db05e9A. Unique TensorFlower} 17275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 172845fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, Eigen::half>; 172945fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, float>; 173045fae93d626e41c17fc988b88de0e2721771d222A. Unique TensorFlowertemplate struct LaunchDepthwiseConvBackpropFilterOp<GpuDevice, double>; 1731b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen} // namespace tensorflow 1732b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#endif // GOOGLE_CUDA 1733