depthwise_conv_op_gpu.cu.cc revision 3c02d1100788789b04e04feb93761f0ad898ea77
1c8b59c046895fa5b6d79f73e0b5817330fcfbfc1A. Unique TensorFlower/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 3b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenLicensed under the Apache License, Version 2.0 (the "License"); 4b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenyou may not use this file except in compliance with the License. 5b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenYou may obtain a copy of the License at 6b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 7b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen http://www.apache.org/licenses/LICENSE-2.0 8b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 9b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenUnless required by applicable law or agreed to in writing, software 10b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chendistributed under the License is distributed on an "AS IS" BASIS, 11b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin ChenSee the License for the specific language governing permissions and 13b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenlimitations under the License. 14b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen==============================================================================*/ 15b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 16b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#if GOOGLE_CUDA 17b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define EIGEN_USE_GPU 18b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 19b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 20ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/kernels/depthwise_conv_op.h" 21b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/platform/types.h" 22b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#include "tensorflow/core/util/cuda_kernel_helper.h" 23ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#include "tensorflow/core/util/tensor_format.h" 24b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 25e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#if !defined(_MSC_VER) 26b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#define UNROLL _Pragma("unroll") 277828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL _Pragma("nounroll") 28e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#else 29ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan#define UNROLL 307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower#define NOUNROLL 31e2d51a87f0727f8537b46048d8241aeebb6e48d6Xiaoqiang Zheng#endif 32b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 33b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chennamespace tensorflow { 34b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowerusing Eigen::GpuDevice; 36b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 37ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass 38ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NHWC format. 397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 407828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 413c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2) 423c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNHWC(const DepthwiseArgs args, const T* input, 433c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* output, int num_outputs) { 44b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int in_rows = args.in_rows; 45b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int in_cols = args.in_cols; 46b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int in_depth = args.in_depth; 477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 517828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 53b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int stride = args.stride; 54b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int pad_rows = args.pad_rows; 55b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int pad_cols = args.pad_cols; 56b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int out_rows = args.out_rows; 57b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int out_cols = args.out_cols; 58b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int out_depth = args.out_depth; 59b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 60b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { 61b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen // Compute the indexes of this thread in the output. 62b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int OD = thread_id % out_depth; 63b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int OC = (thread_id / out_depth) % out_cols; 64b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int OR = (thread_id / out_depth / out_cols) % out_rows; 65b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int OB = thread_id / out_depth / out_cols / out_rows; 66b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen // Compute the input depth and the index of depth multiplier. 67b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int in_d = OD / depth_multiplier; 68b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int multiplier = OD % depth_multiplier; 69b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 70ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 71ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 72b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int input_row_start = OR * stride - pad_rows; 73b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int input_col_start = OC * stride - pad_cols; 74b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int input_row_end = input_row_start + filter_rows; 75b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen const int input_col_end = input_col_start + filter_cols; 76b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T sum = 0; 785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset_temp = in_rows * OB; 80b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen if (input_row_start >= 0 && input_col_start >= 0 && 81b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen input_row_end < in_rows && input_col_end < in_cols) { 82b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 835f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r = input_row_start + f_r; 845f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset_temp = filter_cols * f_r; 85b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = input_col_start + f_c; 875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp)); 905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset = 915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen multiplier + 925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp)); 935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen sum += ldg(input + input_offset) * ldg(filter + filter_offset); 94b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 95b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 96b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } else { 97b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 985f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r = input_row_start + f_r; 995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset_temp = filter_cols * f_r; 100b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 1015f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = input_col_start + f_c; 102b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) { 1035f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = input_col_start + f_c; 1045f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 1055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 1065f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_d + in_depth * (in_c + in_cols * (in_r + input_offset_temp)); 1075f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int filter_offset = 108ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan multiplier + depth_multiplier * 109ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d + in_depth * (f_c + filter_offset_temp)); 1105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen sum += ldg(input + input_offset) * ldg(filter + filter_offset); 111b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 112b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 113b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 114b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 1155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen output[thread_id] = sum; 116b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 117b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen} 118ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 119ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution forward pass 120ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// in NCHW format. 1217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 1227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 1233c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(1024, 2) 1243c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNCHW(const DepthwiseArgs args, const T* input, 1253c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* output, int num_outputs) { 126ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_rows = args.in_rows; 127ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_cols = args.in_cols; 128ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 1297828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 1307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 1317828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 1327828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 1337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 1347828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 135ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 136ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_rows = args.pad_rows; 137ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_cols = args.pad_cols; 138ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_rows = args.out_rows; 139ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_cols = args.out_cols; 140ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 141ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 142ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { 143ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the output. 144ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 145ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We want coalesced reads so we make sure that each warp reads 146ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // a contiguous chunk of memory. 147ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 148ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // THIS IS PROBABLY WRONG, we are not doing coalesced reads 149ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // into the input, because of the depth multiplier division... 150ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int OC = thread_id % out_cols; 151ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int OR = (thread_id / out_cols) % out_rows; 152ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int OD = (thread_id / out_cols / out_rows) % out_depth; 153ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int OB = thread_id / out_cols / out_rows / out_depth; 154ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 155ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the input depth and the index of depth multiplier 156ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // based off the output depth index that this thread is 157ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // computing n. 158ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_d = OD / depth_multiplier; 159ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int multiplier = OD % depth_multiplier; 160ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 161ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Data is stored in the following format (let's assume we 162ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // flatten the height and width into one contiguous dimension 163ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // called "P". 164ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 165ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C1P1 B1C1P2 ..... B1C2P1 B1C2P2 .... 166ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C1P1 B2C1P2 ..... B2C2P1 B2C2P2 .... 167ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 168ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Each row contains in_depth * in_rows * in_cols values 169ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each sample in the batch. 170ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 171ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We can further flatten it into: 172ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 173ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C1P1 B1C1P2 ..... 174ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B1C2P1 B1C2P2 .... 175ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C1P1 B2C1P2 ..... 176ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // B2C2P1 B2C2P2 .... 177ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 178ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // where each row is a contiguous array of all of the spatial 179ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // pixels for a given batch and input depth. The following 180ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // loop unrolls across the filter dimensions for a given thread, 181ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // indexing into the filter value and the corresponding input 182ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // patch. 183ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 184ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We can compute the index into the patch once right here. 185ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset_temp = (OB * in_depth + in_d) * (in_rows * in_cols); 186ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 187ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Finally, we can iterate over the spatial dimensions and perform the 188ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // convolution, writing into the output at the end. 189ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 190ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // We perform an additional optimization, where we can determine 191ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // whether the patch fits within the image indices statically, and 192ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // avoid boundary checking within the loop. 193ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_row_start = OR * stride - pad_rows; 194ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_col_start = OC * stride - pad_cols; 195ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_row_end = input_row_start + filter_rows; 196ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_col_end = input_col_start + filter_cols; 197ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 198ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T sum = 0; 199ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan if (input_row_start >= 0 && input_col_start >= 0 && 200ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan input_row_end < in_rows && input_col_end < in_cols) { 201ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Loop that doesn't need to check for boundary conditions. 202ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 203ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r = input_row_start + f_r; 204ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset_temp = filter_cols * f_r; 205ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 206ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = input_col_start + f_c; 207ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 208ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = 209ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (input_offset_temp) + (in_r * in_cols) + in_c; 210ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 211ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan multiplier + 212ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan depth_multiplier * (in_d + in_depth * (f_c + filter_offset_temp)); 213ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(input + input_offset) * ldg(filter + filter_offset); 214ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 215ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 216ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 217ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Loop that needs to check for boundary conditions. 218ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 219ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r = input_row_start + f_r; 220ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset_temp = filter_cols * f_r; 221ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 222ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = input_col_start + f_c; 223ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // TODO(vrv): the in_r check can be done outside of this loop; 224ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // benchmark both methods to determine the better decision. 225ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) { 226ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = input_col_start + f_c; 227ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 228ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // input_offset_temp indexes into the start of memory 229ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // where the spatial data starts. 230ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = 231ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (input_offset_temp) + (in_r * in_cols) + in_c; 232ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 233ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 234ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan multiplier + depth_multiplier * 235ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d + in_depth * (f_c + filter_offset_temp)); 236ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(input + input_offset) * ldg(filter + filter_offset); 237ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 238ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 239ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 240ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 241ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 242ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan output[thread_id] = sum; 243ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 244ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 245ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 2467828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 2477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 2487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dGPU(const GpuDevice& d, const DepthwiseArgs args, 2497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* input, const T* filter, T* output, 2507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 2517828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int num_outputs = 2527828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.batch * args.out_rows * args.out_cols * args.out_depth; 2533c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower // The compile-time constant version runs faster with a single block. 2543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const int max_block_count = kKnownFilterWidth < 0 || kKnownFilterHeight < 0 || 2553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower kKnownDepthMultiplier < 0 || 2563c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower args.out_rows * args.out_cols <= 256 2573c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower ? std::numeric_limits<int>::max() 2583c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower : d.getNumCudaMultiProcessors(); 2597828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (data_format == FORMAT_NHWC) { 2603c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 2613c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_outputs, d, 2623c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight, 2633c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower kKnownDepthMultiplier>, 2643c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 2657828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dGPUKernelNHWC<T, kKnownFilterWidth, kKnownFilterHeight, 2667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier> 2673c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower <<<std::min(max_block_count, config.block_count), 2683c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower config.thread_per_block, 0, d.stream()>>>(args, input, filter, 2693c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower output, num_outputs); 2707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else if (data_format == FORMAT_NCHW) { 2713c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 2723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_outputs, d, 2733c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight, 2743c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower kKnownDepthMultiplier>, 2753c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 2767828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dGPUKernelNCHW<T, kKnownFilterWidth, kKnownFilterHeight, 2777828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier> 2783c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower <<<std::min(max_block_count, config.block_count), 2793c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower config.thread_per_block, 0, d.stream()>>>(args, input, filter, 2803c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower output, num_outputs); 2817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else { 2827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower assert(false); 2837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } 2847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 285b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 286b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 287b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate <typename T> 288b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chenstruct DepthwiseConv2dGPULaunch { 2897828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower static void Run(const GpuDevice& d, const DepthwiseArgs args, const T* input, 290ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* filter, T* output, TensorFormat data_format) { 2917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3 && 2927828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.depth_multiplier == 1) { 2937828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, 3, 3, 1>(d, args, input, filter, output, 2947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower data_format); 295ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 2967828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dGPU<T, -1, -1, -1>(d, args, input, filter, output, 2977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower data_format); 298ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 299b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen } 300b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen}; 301b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 302b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<float>; 303b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chentemplate struct DepthwiseConv2dGPULaunch<double>; 304b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen 3055f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. 3067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 3077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 3083c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 3093c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNHWC(const DepthwiseArgs args, 3103c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 3113c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* filter, T* in_backprop, 3123c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_in_backprop) { 3135f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_rows = args.in_rows; 3145f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_cols = args.in_cols; 3155f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_depth = args.in_depth; 3167828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 3177828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 3187828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 3197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 3207828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 3217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 3225f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int stride = args.stride; 3235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int pad_rows = args.pad_rows; 3245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int pad_cols = args.pad_cols; 3255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_rows = args.out_rows; 3265f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_cols = args.out_cols; 3275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_depth = args.out_depth; 3285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 3295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { 3305f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the indexes of this thread in the output. 3315f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_d = thread_id % in_depth; 3325f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = (thread_id / in_depth) % in_cols; 3335f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r = (thread_id / in_depth / in_cols) % in_rows; 3345f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int b = thread_id / in_depth / in_cols / in_rows; 3355f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 3365f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T sum = 0; 3375f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 3385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_r_start = 3395f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride); 3405f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride); 3415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_c_start = 3425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride); 3435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride); 3445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 3457828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower NOUNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { 3462f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int f_r = in_r + pad_rows - out_r * stride; 3472f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int temp_out_backprop_offset = 3482f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower out_depth * out_cols * (out_r + out_rows * b); 3492f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int temp_filter_offset = filter_cols * f_r; 3507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower NOUNROLL for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { 3512f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int f_c = in_c + pad_cols - out_c * stride; 3522f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower int filter_offset = 3532f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower depth_multiplier * (in_d + in_depth * (f_c + temp_filter_offset)); 3542f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower const int out_backprop_offset = 3552f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower out_depth * out_c + temp_out_backprop_offset; 356b286574da19e18371e759fe6b676bb07728ef9acA. Unique TensorFlower#pragma unroll 6 3572f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower for (int i = 0; i < depth_multiplier; ++i) { 3582f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower sum += ldg(out_backprop + out_backprop_offset + 3592f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower in_d * depth_multiplier + i) * 3602f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower ldg(filter + filter_offset + i); 3615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 3625f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 3635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 3645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_backprop_offset = 3655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_d + in_depth * (in_c + in_cols * (in_r + in_rows * b)); 3665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_backprop[in_backprop_offset] = sum; 3675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 3685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen} 3695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 3707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 3717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 3723c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 373ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan DepthwiseConv2dBackpropInputGPUKernelNCHW(const DepthwiseArgs args, 374ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* out_backprop, 375ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* filter, T* in_backprop, 376ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan int num_in_backprop) { 377ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_rows = args.in_rows; 378ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_cols = args.in_cols; 379ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 3807828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 3817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 3827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 3837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 3847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 3857828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 386ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 387ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_rows = args.pad_rows; 388ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_cols = args.pad_cols; 389ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_rows = args.out_rows; 390ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_cols = args.out_cols; 391ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 392ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 393ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // TODO(vrv): Consider assigning threads to output and using 394ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // atomics for accumulation, similar to the filter case. 395ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { 396ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the input. 397ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = thread_id % in_cols; 398ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r = (thread_id / in_cols) % in_rows; 399ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_d = (thread_id / in_cols / in_rows) % in_depth; 400ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int b = thread_id / in_depth / in_cols / in_rows; 401ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 402ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T sum = 0; 403ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_d_start = in_d * depth_multiplier; 404ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_d_end = out_d_start + depth_multiplier; 405ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 406ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_r_start = 407ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan tf_max<int>(0, (in_r - filter_rows + pad_rows + stride) / stride); 408ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_r_end = tf_min(out_rows - 1, (in_r + pad_rows) / stride); 409ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_c_start = 410ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan tf_max(0, (in_c - filter_cols + pad_cols + stride) / stride); 411ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_c_end = tf_min(out_cols - 1, (in_c + pad_cols) / stride); 412ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 413ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int out_d = out_d_start; out_d < out_d_end; ++out_d) { 414ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { 415ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int f_r = in_r + pad_rows - out_r * stride; 416ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_dm = out_d - out_d_start; 417ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 418ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int temp_filter_offset = filter_cols * f_r; 419ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { 420ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int f_c = in_c + pad_cols - out_c * stride; 421ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int filter_offset = 422ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan filter_dm + args.depth_multiplier * 423ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d + in_depth * (f_c + temp_filter_offset)); 424ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 425ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_backprop_offset = 426ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (b * out_depth * out_rows * out_cols) + 427ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (out_d * out_rows * out_cols) + (out_r * out_cols) + (out_c); 428ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 429ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan sum += ldg(out_backprop + out_backprop_offset) * 430ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan ldg(filter + filter_offset); 431ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 432ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 433ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 434ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_backprop_offset = (b * in_rows * in_cols * in_depth) + 435ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d * in_rows * in_cols) + 436ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_r * in_cols) + (in_c); 437ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan in_backprop[in_backprop_offset] = sum; 438ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 439ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 440ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 4417828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 4427828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 4437828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropInputGPU(const GpuDevice& d, 4447828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const DepthwiseArgs args, 4457828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* out_backprop, 4467828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* filter, T* in_backprop, 4477828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 4487828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int num_in_backprop = 4497828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.batch * args.in_rows * args.in_cols * args.in_depth; 4507828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (data_format == FORMAT_NHWC) { 4513c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 4523c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_in_backprop, d, 4533c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNHWC< 4543c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>, 4553c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 4567828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNHWC< 4577828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> 4587828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 4597828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args, out_backprop, filter, in_backprop, num_in_backprop); 4607828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else if (data_format == FORMAT_NCHW) { 4613c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 4623c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_in_backprop, d, 4633c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNCHW< 4643c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>, 4653c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 4667828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dBackpropInputGPUKernelNCHW< 4677828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> 4687828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 4697828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args, out_backprop, filter, in_backprop, num_in_backprop); 4707828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else { 4717828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower assert(false); 4727828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } 4737828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 4747828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower 4755f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 4765f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T> 4775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropInputGPULaunch { 4787828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower static void Run(const GpuDevice& d, const DepthwiseArgs args, 479ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* out_backprop, const T* filter, T* in_backprop, 480ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan TensorFormat data_format) { 4817828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (args.depth_multiplier == 1) { 4827828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3) { 4837828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, 3, 3, 1>( 4847828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower d, args, out_backprop, filter, in_backprop, data_format); 485ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 4867828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, 1>( 4877828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower d, args, out_backprop, filter, in_backprop, data_format); 488ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 4892f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower } else { 4907828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dBackpropInputGPU<T, -1, -1, -1>( 4917828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower d, args, out_backprop, filter, in_backprop, data_format); 4922f9c1d2d205e1b7be111dd87a26d7c3a4d57c6c1A. Unique TensorFlower } 4935f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 4945f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}; 4955f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 4965f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<float>; 4975f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropInputGPULaunch<double>; 4985f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 4995f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 5007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 5017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 5023c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 5033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNHWC(const DepthwiseArgs args, 5043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 5053c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* input, 5063c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T* filter_backprop, 5073c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_out_backprop) { 5085f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_rows = args.in_rows; 5095f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_cols = args.in_cols; 5105f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_depth = args.in_depth; 5117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 5127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 5137828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 5147828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 5157828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 5167828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 5175f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int stride = args.stride; 5185f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int pad_rows = args.pad_rows; 5195f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int pad_cols = args.pad_cols; 5205f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_rows = args.out_rows; 5215f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_cols = args.out_cols; 5225f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_depth = args.out_depth; 5235f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 5245f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) { 5255f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the indexes of this thread in the output. 5265f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_d = thread_id % out_depth; 5275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_c = (thread_id / out_depth) % out_cols; 5285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_r = (thread_id / out_depth / out_cols) % out_rows; 5295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int b = thread_id / out_depth / out_cols / out_rows; 5305f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Compute the input depth and the index of depth multiplier. 5315f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_d = out_d / depth_multiplier; 5325f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int dm = out_d % depth_multiplier; 5335f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 534ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 535ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 5365f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r_start = out_r * stride - pad_rows; 5375f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c_start = out_c * stride - pad_cols; 5385f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r_end = in_r_start + filter_rows; 5395f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c_end = in_c_start + filter_cols; 5405f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 5415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int out_backprop_offset = 5425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen out_d + out_depth * (out_c + out_cols * (out_r + out_rows * b)); 5435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const T out_bp = ldg(out_backprop + out_backprop_offset); 5445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows && 5455f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_c_end < in_cols) { 5465f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 5475f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r = in_r_start + f_r; 5485f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Avoid repeated computation. 5495f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset_temp = in_cols * (in_r + in_rows * b); 5505f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 5515f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = in_c_start + f_c; 5525f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 5535f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = in_d + in_depth * (in_c + input_offset_temp); 5545f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T partial_sum = ldg(input + input_offset) * out_bp; 5555f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T* addr = filter_backprop + 556ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (dm + depth_multiplier * 557ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d + in_depth * (f_c + filter_cols * f_r))); 5585f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CudaAtomicAdd(addr, partial_sum); 5595f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 5605f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 5615f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } else { 5625f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 5635f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_r = in_r_start + f_r; 5645f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Avoid repeated computation. 5655f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset_temp = in_cols * (in_r + in_rows * b); 5665f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 5675f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int in_c = in_c_start + f_c; 5685f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int addr_temp = filter_cols * f_r; 5695f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 5705f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) { 5715f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen const int input_offset = 5725f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen in_d + in_depth * (in_c + input_offset_temp); 5735f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T partial_sum = ldg(input + input_offset) * out_bp; 5745f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen T* addr = 5755f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen filter_backprop + 5765f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp))); 5775f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // Potentially many threads can add to the same address so we have 5785f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // to use atomic add here. 5795f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // TODO(jmchen): If atomic add turns out to be slow, we can: 5805f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen // 1. allocate multiple buffers for the gradients (one for each 581ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // example in a batch, for example). This can reduce the 582ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // contention on the destination; 2. Have each thread compute one 583ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // gradient for an element in the filters. This should work well 584ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // when the input depth is big and filter size is not too small. 585ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CudaAtomicAdd(addr, partial_sum); 586ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 587ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 588ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 589ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 590ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 591ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan} 592ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 593ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 5947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 5957828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 5963c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower__global__ void __launch_bounds__(640, 2) 5973c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNCHW(const DepthwiseArgs args, 5983c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* out_backprop, 5993c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower const T* input, 6003c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T* filter_backprop, 6013c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower int num_out_backprop) { 602ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_rows = args.in_rows; 603ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_cols = args.in_cols; 604ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_depth = args.in_depth; 6057828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_rows = 6067828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterHeight < 0 ? args.filter_rows : kKnownFilterHeight; 6077828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int filter_cols = 6087828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownFilterWidth < 0 ? args.filter_cols : kKnownFilterWidth; 6097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int depth_multiplier = 6107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower kKnownDepthMultiplier < 0 ? args.depth_multiplier : kKnownDepthMultiplier; 611ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int stride = args.stride; 612ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_rows = args.pad_rows; 613ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int pad_cols = args.pad_cols; 614ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_rows = args.out_rows; 615ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_cols = args.out_cols; 616ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_depth = args.out_depth; 617ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 618ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) { 619ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the indexes of this thread in the output. 620ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_c = thread_id % out_cols; 621ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_r = (thread_id / out_cols) % out_rows; 622ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_d = (thread_id / out_cols / out_rows) % out_depth; 623ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 624ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int b = thread_id / out_depth / out_cols / out_rows; 625ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Compute the input depth and the index of depth multiplier. 626ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_d = out_d / depth_multiplier; 627ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int dm = out_d % depth_multiplier; 628ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 629ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Decide if all input is valid, if yes, we can skip the boundary checks 630ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // for each input. 631ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r_start = out_r * stride - pad_rows; 632ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c_start = out_c * stride - pad_cols; 633ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r_end = in_r_start + filter_rows; 634ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c_end = in_c_start + filter_cols; 635ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 636ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int out_backprop_offset = (b * out_depth * out_rows * out_cols) + 637ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (out_d * out_rows * out_cols) + 638ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (out_r * out_cols) + (out_c); 639ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 640ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T out_bp = ldg(out_backprop + out_backprop_offset); 641ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan if (in_r_start >= 0 && in_c_start >= 0 && in_r_end < in_rows && 642ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan in_c_end < in_cols) { 643ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 644ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r = in_r_start + f_r; 645ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Avoid repeated computation. 646ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset_temp = (b * in_depth * in_rows * in_cols) + 647ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d * in_rows * in_cols) + 648ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_r * in_cols); 649ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 650ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 651ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = in_c_start + f_c; 652ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = input_offset_temp + in_c; 653ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T partial_sum = ldg(input + input_offset) * out_bp; 654ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T* addr = filter_backprop + 655ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (dm + depth_multiplier * 656ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d + in_depth * (f_c + filter_cols * f_r))); 657ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan CudaAtomicAdd(addr, partial_sum); 658ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 659ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 660ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 661ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_r = 0; f_r < filter_rows; ++f_r) { 662ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_r = in_r_start + f_r; 663ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Avoid repeated computation. 664ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset_temp = (b * in_depth * in_rows * in_cols) + 665ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_d * in_rows * in_cols) + 666ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (in_r * in_cols); 667ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan UNROLL for (int f_c = 0; f_c < filter_cols; ++f_c) { 668ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int in_c = in_c_start + f_c; 669ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int addr_temp = filter_cols * f_r; 670ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan 671ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan if (in_r >= 0 && in_r < in_rows && in_c >= 0 && in_c < in_cols) { 672ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const int input_offset = input_offset_temp + in_c; 673ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T partial_sum = ldg(input + input_offset) * out_bp; 674ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan T* addr = 675ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan filter_backprop + 676ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan (dm + depth_multiplier * (in_d + in_depth * (f_c + addr_temp))); 677ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // Potentially many threads can add to the same address so we have 678ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // to use atomic add here. 679ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // TODO(jmchen): If atomic add turns out to be slow, we can: 680ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // 1. allocate multiple buffers for the gradients (one for each 681ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // example in a batch, for example). This can reduce the 682ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // contention on the destination; 2. Have each thread compute one 683ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // gradient for an element in the filters. This should work well 684ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan // when the input depth is big and filter size is not too small. 6855f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen CudaAtomicAdd(addr, partial_sum); 6865f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 6875f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 6885f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 6895f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 6905f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 6915f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen} 6925f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 6937828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowertemplate <typename T, int kKnownFilterWidth, int kKnownFilterHeight, 6947828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower int kKnownDepthMultiplier> 6957828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlowervoid LaunchDepthwiseConv2dBackpropFilterGPU(const GpuDevice& d, 6967828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const DepthwiseArgs args, 6977828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* out_backprop, 6987828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const T* input, T* filter_backprop, 6997828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower TensorFormat data_format) { 7007828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower const int num_out_backprop = 7017828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.batch * args.out_rows * args.out_cols * args.out_depth; 7027828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (data_format == FORMAT_NHWC) { 7033c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 7043c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_out_backprop, d, 7053c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNHWC< 7063c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>, 7073c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 7087828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNHWC< 7097828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> 7107828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 7117828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args, out_backprop, input, filter_backprop, num_out_backprop); 7127828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else if (data_format == FORMAT_NCHW) { 7133c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower CudaLaunchConfig config = GetCudaLaunchConfig( 7143c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower num_out_backprop, d, 7153c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNCHW< 7163c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier>, 7173c02d1100788789b04e04feb93761f0ad898ea77A. Unique TensorFlower 0); 7187828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower DepthwiseConv2dBackpropFilterGPUKernelNCHW< 7197828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower T, kKnownFilterWidth, kKnownFilterHeight, kKnownDepthMultiplier> 7207828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( 7217828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args, out_backprop, input, filter_backprop, num_out_backprop); 7227828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } else { 7237828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower assert(false); 7247828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower } 7257828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower} 7267828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower 7275f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen// A simple launch pad to launch the Cuda kernel for depthwise convolution. 7285f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate <typename T> 7295f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chenstruct DepthwiseConv2dBackpropFilterGPULaunch { 7307828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower static void Run(const GpuDevice& d, const DepthwiseArgs args, 731ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan const T* out_backprop, const T* input, T* filter_backprop, 732ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan TensorFormat data_format) { 7337828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower if (args.filter_rows == 3 && args.filter_cols == 3 && 7347828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower args.depth_multiplier == 1) { 7357828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, 3, 3, 1>( 7367828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower d, args, out_backprop, input, filter_backprop, data_format); 737ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } else { 7387828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower LaunchDepthwiseConv2dBackpropFilterGPU<T, -1, -1, -1>( 7397828637e07b0081a37dfdc66ff912dd1d6ff3228A. Unique TensorFlower d, args, out_backprop, input, filter_backprop, data_format); 740ce016c8726a9250be98337691090acb6655a0aceVijay Vasudevan } 7415f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen } 7425f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen}; 7435f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chen 7445f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<float>; 7455f7683ea100c06bba66536fd97b5c141f576e0d7Jianmin Chentemplate struct DepthwiseConv2dBackpropFilterGPULaunch<double>; 746b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen} // namespace tensorflow 747b51ef0cd06e1bfb529b272e55010790ff3ead31fJianmin Chen#endif // GOOGLE_CUDA 748