maxpooling_op.cc revision 200fa5fb0be1523f828014ae8f3ca28099cc942a
/* Copyright 2015 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/maxpooling_op.h"

#include <algorithm>
#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

const int kInvalidMaxPoolingIndex = -1;
51 52template <typename Device, typename T> 53struct SpatialMaxPoolWithArgMaxHelper { 54 static void Compute(Tensor* output, Tensor* output_arg_max, 55 const Tensor& tensor_in, const PoolParameters& params, 56 const Padding& padding) { 57 typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> 58 ConstEigenMatrixMap; 59 typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> 60 EigenMatrixMap; 61 typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>> 62 EigenIndexMatrixMap; 63 64 ConstEigenMatrixMap in_mat( 65 tensor_in.flat<T>().data(), params.depth, 66 params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch); 67 EigenMatrixMap out_mat( 68 output->flat<T>().data(), params.depth, 69 params.out_width * params.out_height * params.tensor_in_batch); 70 EigenIndexMatrixMap out_arg_max_mat( 71 output_arg_max->flat<int64>().data(), params.depth, 72 params.out_width * params.out_height * params.tensor_in_batch); 73 74 // Initializes the output tensor with MIN<T>. 75 output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex); 76 output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest()); 77 78 // The following code basically does the following: 79 // 1. Flattens the input and output tensors into two dimensional arrays. 80 // tensor_in_as_matrix: 81 // depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch) 82 // output_as_matrix: 83 // depth by (out_width * out_height * tensor_in_batch) 84 // 85 // 2. Walks through the set of columns in the flattened tensor_in_as_matrix, 86 // and updates the corresponding column(s) in output_as_matrix with the 87 // max value. 88 for (int b = 0; b < params.tensor_in_batch; ++b) { 89 for (int h = 0; h < params.tensor_in_rows; ++h) { 90 for (int w = 0; w < params.tensor_in_cols; ++w) { 91 // (h_start, h_end) * (w_start, w_end) is the range that the input 92 // vector projects to. 
93 const int hpad = h + params.pad_rows; 94 const int wpad = w + params.pad_cols; 95 const int h_start = 96 (hpad < params.window_rows) 97 ? 0 98 : (hpad - params.window_rows) / params.row_stride + 1; 99 const int h_end = 100 std::min(hpad / params.row_stride + 1, params.out_height); 101 const int w_start = 102 (wpad < params.window_cols) 103 ? 0 104 : (wpad - params.window_cols) / params.col_stride + 1; 105 const int w_end = 106 std::min(wpad / params.col_stride + 1, params.out_width); 107 // compute elementwise max 108 const int in_index = 109 (b * params.tensor_in_rows + h) * params.tensor_in_cols + w; 110 for (int ph = h_start; ph < h_end; ++ph) { 111 for (int pw = w_start; pw < w_end; ++pw) { 112 const int out_index = 113 (b * params.out_height + ph) * params.out_width + pw; 114 /// NOTES(zhengxq): not using the eigen matrix operation for now. 115 /// May consider parallelizing the operations if needed. 116 for (int d = 0; d < params.depth; ++d) { 117 const T& input_ref = in_mat.coeffRef(d, in_index); 118 T& output_ref = out_mat.coeffRef(d, out_index); 119 int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index); 120 if (output_ref < input_ref || 121 out_arg_max_ref == kInvalidMaxPoolingIndex) { 122 output_ref = input_ref; 123 int input_offset = in_index * params.depth + d; 124 out_arg_max_ref = input_offset; 125 } 126 } 127 } 128 } 129 } 130 } 131 } 132 } 133}; 134 135REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU), 136 MaxPoolingOp<CPUDevice, float>); 137 138#if GOOGLE_CUDA 139// Forward declarations for the functor specializations for GPU. 
140namespace functor { 141#define DECLARE_GPU_SPEC(T) \ 142 template <> \ 143 void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()( \ 144 const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \ 145 typename TTypes<T, 4>::ConstTensor input, int window_rows, \ 146 int window_cols, int row_stride, int col_stride, \ 147 const Eigen::PaddingType& padding); \ 148 extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>; 149 150DECLARE_GPU_SPEC(float); 151#undef DECLARE_GPU_SPEC 152} // namespace functor 153 154// Note(jiayq): Currently, the Caffe custom implementation is faster than the 155// default Eigen implementation so we are using the custom kernel as the 156// default. However, you can explicitly invoke the eigen version using 157// kernel_label_map. 158REGISTER_KERNEL_BUILDER(Name("MaxPool") 159 .Device(DEVICE_GPU) 160 .Label("eigen_tensor"), 161 MaxPoolingOp<Eigen::GpuDevice, float>); 162#endif // GOOGLE_CUDA 163 164// The operation to compute MaxPool gradients. 165// It takes three inputs: 166// - The original input tensor 167// - The original output tensor 168// - Backprop tensor for output 169// It produces one output: backprop tensor for input. 
170template <class Device, class T> 171class MaxPoolingGradOp : public OpKernel { 172 public: 173 explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { 174 string data_format; 175 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 176 OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 177 errors::InvalidArgument("Invalid data format")); 178 OP_REQUIRES(context, data_format_ == FORMAT_NHWC, 179 errors::InvalidArgument( 180 "Default MaxPoolinGradgOp only supports NHWC.")); 181 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 182 OP_REQUIRES(context, ksize_.size() == 4, 183 errors::InvalidArgument("Sliding window ksize field must " 184 "specify 4 dimensions")); 185 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 186 OP_REQUIRES(context, stride_.size() == 4, 187 errors::InvalidArgument("Sliding window strides field must " 188 "specify 4 dimensions")); 189 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 190 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, 191 errors::Unimplemented( 192 "Pooling is not yet supported on the batch dimension.")); 193 OP_REQUIRES( 194 context, ksize_[3] == 1 && stride_[3] == 1, 195 errors::Unimplemented( 196 "MaxPoolingGrad is not yet supported on the depth dimension.")); 197 } 198 199 void Compute(OpKernelContext* context) override { 200 const Tensor& tensor_in = context->input(0); 201 const Tensor& tensor_out = context->input(1); 202 const Tensor& out_backprop = context->input(2); 203 204 // For maxpooling, tensor_in should have 4 dimensions. 205 OP_REQUIRES(context, tensor_in.dims() == 4, 206 errors::InvalidArgument("tensor_in must be 4-dimensional")); 207 OP_REQUIRES(context, tensor_out.dims() == 4, 208 errors::InvalidArgument("tensor_out must be 4-dimensional")); 209 // For maxpooling, out_backprop should have 4 dimensions. 
210 OP_REQUIRES(context, out_backprop.dims() == 4, 211 errors::InvalidArgument("out_backprop must be 4-dimensional")); 212 213 TensorShape output_shape = tensor_in.shape(); 214 215 // Tensor index_tensor(context->allocator(), DT_INT32, output_shape); 216 217 Tensor tensor_out_dup; 218 OP_REQUIRES_OK(context, 219 context->allocate_temp(DataTypeToEnum<T>::v(), 220 tensor_out.shape(), &tensor_out_dup)); 221 Tensor tensor_out_arg_max; 222 OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(), 223 tensor_out.shape(), 224 &tensor_out_arg_max)); 225 226 PoolParameters params{context, ksize_, stride_, 227 padding_, FORMAT_NHWC, tensor_in.shape()}; 228 if (!context->status().ok()) { 229 return; 230 } 231 232 Tensor* output = nullptr; 233 OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); 234 output->flat<T>().setZero(); 235 236 SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute( 237 &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_); 238 auto out_backprop_flat = out_backprop.flat<T>(); 239 auto input_backprop_flat = output->flat<T>(); 240 auto out_arg_max_flat = tensor_out_arg_max.flat<int64>(); 241 int num_total_outputs = out_backprop.flat<T>().size(); 242 int num_total_inputs = input_backprop_flat.size(); 243 244 for (int index = 0; index < num_total_outputs; ++index) { 245 int input_backprop_index = out_arg_max_flat(index); 246 // Although this check is in the inner loop, it is worth its value 247 // so we don't end up with memory corruptions. 
Our benchmark shows that 248 // the performance impact is quite small 249 CHECK(input_backprop_index >= 0 && 250 input_backprop_index < num_total_inputs) 251 << "Invalid input backprop index: " << input_backprop_index << ", " 252 << num_total_inputs; 253 input_backprop_flat(input_backprop_index) += out_backprop_flat(index); 254 } 255 } 256 257 private: 258 std::vector<int32> ksize_; 259 std::vector<int32> stride_; 260 Padding padding_; 261 TensorFormat data_format_; 262}; 263 264REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU), 265 MaxPoolingGradOp<CPUDevice, float>); 266 267#ifdef GOOGLE_CUDA 268 269static void MaxPoolingBackwardCustomKernel( 270 OpKernelContext* context, const std::vector<int32>& size, 271 const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in, 272 const Tensor& out_backprop, const TensorShape& tensor_in_shape) { 273 Tensor* output = nullptr; 274 275 OP_REQUIRES_OK(context, 276 context->allocate_output(0, tensor_in_shape, &output)); 277 278 PoolParameters params{context, size, stride, 279 padding, FORMAT_NHWC, tensor_in_shape}; 280 if (!context->status().ok()) { 281 return; 282 } 283 284 MaxPoolBackwardNoMask( 285 tensor_in->flat<float>().data(), params.tensor_in_batch, 286 params.tensor_in_rows, params.tensor_in_cols, params.depth, 287 params.out_height, params.out_width, params.window_rows, 288 params.window_cols, params.row_stride, params.col_stride, params.pad_rows, 289 params.pad_cols, out_backprop.flat<float>().data(), 290 output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>()); 291} 292 293template <class T> 294class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel { 295 public: 296 typedef Eigen::GpuDevice Device; 297 298 explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) { 299 string data_format; 300 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 301 OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 302 
errors::InvalidArgument("Invalid data format")); 303 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 304 OP_REQUIRES(context, ksize_.size() == 4, 305 errors::InvalidArgument("Sliding window ksize field must " 306 "specify 4 dimensions")); 307 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 308 OP_REQUIRES(context, stride_.size() == 4, 309 errors::InvalidArgument("Sliding window strides field must " 310 "specify 4 dimensions")); 311 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 312 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N'); 313 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N'); 314 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1, 315 errors::Unimplemented( 316 "Pooling is not yet supported on the batch dimension.")); 317 318 use_dnn_ = CanUseCudnn(); 319 } 320 321 void Compute(OpKernelContext* context) override { 322 const Tensor& tensor_in = context->input(0); 323 const Tensor& tensor_out = context->input(1); 324 const Tensor& out_backprop = context->input(2); 325 326 // For maxpooling, tensor_in should have 4 dimensions. 327 OP_REQUIRES(context, tensor_in.dims() == 4, 328 errors::InvalidArgument("tensor_in must be 4-dimensional 4")); 329 OP_REQUIRES(context, tensor_out.dims() == 4, 330 errors::InvalidArgument("tensor_out must be 4-dimensional")); 331 // For maxpooling, out_backprop should have 4 dimensions. 
332 OP_REQUIRES(context, out_backprop.dims() == 4, 333 errors::InvalidArgument("out_backprop must be 4-dimensional")); 334 335 TensorShape output_shape = tensor_in.shape(); 336 337 if (use_dnn_) { 338 DnnPoolingGradOp<T>::Compute( 339 context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_, 340 stride_, padding_, data_format_, &tensor_in, &tensor_out, 341 out_backprop, output_shape); 342 } else { 343 CHECK(data_format_ == FORMAT_NHWC) 344 << "Non-Cudnn MaxPoolGrad only supports NHWC format"; 345 MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_, 346 &tensor_in, out_backprop, output_shape); 347 } 348 } 349 350 private: 351 std::vector<int32> ksize_; 352 std::vector<int32> stride_; 353 Padding padding_; 354 TensorFormat data_format_; 355 bool use_dnn_; 356}; 357 358REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU), 359 MaxPoolingGradOp<Eigen::GpuDevice, float>); 360 361#endif // GOOGLE_CUDA 362 363template <typename Device, typename T> 364struct LaunchMaxPoolingNoMask; 365 366template <typename Device, typename T> 367class MaxPoolingNoMaskOp : public OpKernel { 368 public: 369 explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) 370 : OpKernel(context) { 371 string data_format; 372 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 373 OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 374 errors::InvalidArgument("Invalid data format")); 375 OP_REQUIRES(context, data_format_ == FORMAT_NHWC, 376 errors::InvalidArgument( 377 "Default MaxPoolingNoMaskOp only supports NHWC.")); 378 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 379 OP_REQUIRES(context, ksize_.size() == 4, 380 errors::InvalidArgument("Sliding window ksize field must " 381 "specify 4 dimensions")); 382 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 383 OP_REQUIRES(context, stride_.size() == 4, 384 errors::InvalidArgument("Sliding window stride field must " 385 "specify 4 dimensions")); 386 
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 387 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, 388 errors::Unimplemented( 389 "Pooling is not yet supported on the batch dimension.")); 390 } 391 392 void Compute(OpKernelContext* context) override { 393 const Tensor& tensor_in = context->input(0); 394 395 PoolParameters params{context, ksize_, stride_, 396 padding_, data_format_, tensor_in.shape()}; 397 if (!context->status().ok()) { 398 return; 399 } 400 401 TensorShape out_shape({params.tensor_in_batch, params.out_height, 402 params.out_width, params.depth}); 403 Tensor* output = nullptr; 404 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); 405 406 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, 407 output); 408 } 409 410 private: 411 std::vector<int32> ksize_; 412 std::vector<int32> stride_; 413 Padding padding_; 414 TensorFormat data_format_; 415}; 416 417template <typename Device, typename T> 418struct LaunchMaxPoolingWithArgmax; 419 420template <typename Device, typename T> 421class MaxPoolingWithArgmaxOp : public OpKernel { 422 public: 423 explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context) 424 : OpKernel(context) { 425 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 426 OP_REQUIRES(context, ksize_.size() == 4, 427 errors::InvalidArgument("Sliding window ksize field must " 428 "specify 4 dimensions")); 429 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 430 OP_REQUIRES(context, stride_.size() == 4, 431 errors::InvalidArgument("Sliding window stride field must " 432 "specify 4 dimensions")); 433 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 434 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, 435 errors::Unimplemented( 436 "Pooling is not yet supported on the batch dimension.")); 437 } 438 439 void Compute(OpKernelContext* context) override { 440 const Tensor& tensor_in = context->input(0); 441 442 PoolParameters 
params{context, ksize_, stride_, 443 padding_, FORMAT_NHWC, tensor_in.shape()}; 444 if (!context->status().ok()) { 445 return; 446 } 447 448 TensorShape out_shape({params.tensor_in_batch, params.out_height, 449 params.out_width, params.depth}); 450 Tensor* output = nullptr; 451 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); 452 Tensor* argmax = nullptr; 453 OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax)); 454 455 LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in, 456 output, argmax); 457 } 458 459 private: 460 std::vector<int32> ksize_; 461 std::vector<int32> stride_; 462 Padding padding_; 463}; 464 465template <typename Device, typename T> 466struct LaunchMaxPoolingGradWithArgmax; 467 468template <typename Device, typename T> 469class MaxPoolingGradWithArgmaxOp : public OpKernel { 470 public: 471 explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context) 472 : OpKernel(context) { 473 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 474 OP_REQUIRES(context, ksize_.size() == 4, 475 errors::InvalidArgument("Sliding window ksize field must " 476 "specify 4 dimensions")); 477 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 478 OP_REQUIRES(context, stride_.size() == 4, 479 errors::InvalidArgument("Sliding window stride field must " 480 "specify 4 dimensions")); 481 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 482 OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1, 483 errors::Unimplemented( 484 "Pooling is not yet supported on the batch dimension.")); 485 } 486 487 void Compute(OpKernelContext* context) override { 488 const Tensor& tensor_in = context->input(0); 489 const Tensor& grad_in = context->input(1); 490 const Tensor& argmax = context->input(2); 491 492 PoolParameters params{context, ksize_, stride_, 493 padding_, FORMAT_NHWC, tensor_in.shape()}; 494 if (!context->status().ok()) { 495 return; 496 } 497 498 TensorShape 
out_shape({params.tensor_in_batch, params.tensor_in_rows, 499 params.tensor_in_cols, params.depth}); 500 Tensor* grad_out = nullptr; 501 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out)); 502 503 LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in, 504 argmax, grad_out); 505 } 506 507 private: 508 std::vector<int32> ksize_; 509 std::vector<int32> stride_; 510 Padding padding_; 511}; 512 513#if GOOGLE_CUDA 514template <typename T> 515class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel { 516 public: 517 typedef GPUDevice Device; 518 explicit MaxPoolingNoMaskOp(OpKernelConstruction* context) 519 : OpKernel(context) { 520 string data_format; 521 OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 522 OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 523 errors::InvalidArgument("Invalid data format")); 524 OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_)); 525 OP_REQUIRES(context, ksize_.size() == 4, 526 errors::InvalidArgument("Sliding window ksize field must " 527 "specify 4 dimensions")); 528 OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_)); 529 OP_REQUIRES(context, stride_.size() == 4, 530 errors::InvalidArgument("Sliding window stride field must " 531 "specify 4 dimensions")); 532 OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 533 const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N'); 534 const int32 stride_n = GetTensorDim(stride_, data_format_, 'N'); 535 OP_REQUIRES(context, ksize_n == 1 && stride_n == 1, 536 errors::Unimplemented( 537 "Pooling is not yet supported on the batch dimension.")); 538 use_dnn_ = CanUseCudnn(); 539 } 540 541 void Compute(OpKernelContext* context) override { 542 const Tensor& tensor_in = context->input(0); 543 544 PoolParameters params{context, ksize_, stride_, 545 padding_, data_format_, tensor_in.shape()}; 546 if (!context->status().ok()) { 547 return; 548 } 549 550 TensorShape out_shape = 551 
ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height, 552 params.out_width, params.depth); 553 if (use_dnn_ && data_format_ == FORMAT_NCHW) { 554 DnnPoolingOp<T>::Compute( 555 context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_, 556 stride_, padding_, data_format_, tensor_in, out_shape); 557 } else { 558 CHECK(data_format_ == FORMAT_NHWC) 559 << "Non-Cudnn MaxPool only supports NHWC format"; 560 Tensor* output = nullptr; 561 OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); 562 LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in, 563 output); 564 } 565 } 566 567 private: 568 std::vector<int32> ksize_; 569 std::vector<int32> stride_; 570 Padding padding_; 571 TensorFormat data_format_; 572 bool use_dnn_; 573}; 574 575template <typename T> 576struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> { 577 static void launch(OpKernelContext* context, const PoolParameters& params, 578 const Tensor& input, Tensor* output) { 579 bool status = MaxPoolForwardWithOptionalArgmax( 580 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, 581 params.tensor_in_cols, params.depth, params.out_height, 582 params.out_width, params.window_rows, params.window_cols, 583 params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, 584 output->flat<T>().data(), nullptr, context->eigen_gpu_device()); 585 if (!status) { 586 context->SetStatus( 587 errors::Internal("Failed launching MaxPoolForwardNoMask")); 588 } 589 } 590}; 591 592REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU), 593 MaxPoolingNoMaskOp<Eigen::GpuDevice, float>); 594 595template <typename T> 596struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> { 597 static void launch(OpKernelContext* context, const PoolParameters& params, 598 const Tensor& input, Tensor* output, Tensor* argmax) { 599 bool status = MaxPoolForwardWithOptionalArgmax( 600 input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows, 601 
params.tensor_in_cols, params.depth, params.out_height, 602 params.out_width, params.window_rows, params.window_cols, 603 params.row_stride, params.col_stride, params.pad_rows, params.pad_cols, 604 output->flat<T>().data(), 605 reinterpret_cast<int64*>(argmax->flat<int64>().data()), 606 context->eigen_gpu_device()); 607 if (!status) { 608 context->SetStatus( 609 errors::Internal("Failed launching MaxPoolForwardWithArgmax")); 610 } 611 } 612}; 613 614REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax") 615 .Device(DEVICE_GPU) 616 .TypeConstraint<int64>("Targmax"), 617 MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>); 618 619template <typename T> 620struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> { 621 static void launch(OpKernelContext* context, const PoolParameters& params, 622 const Tensor& grad_in, const Tensor& argmax, 623 Tensor* grad_out) { 624 const int input_size = params.tensor_in_batch * params.tensor_in_rows * 625 params.tensor_in_cols * params.depth; 626 const int output_size = params.tensor_in_batch * params.out_height * 627 params.out_width * params.depth; 628 const int top_offset = params.out_height * params.out_width * params.depth; 629 const int bottom_offset = 630 params.tensor_in_rows * params.tensor_in_cols * params.depth; 631 bool status = MaxPoolBackwardWithArgmax( 632 output_size, input_size, grad_in.flat<T>().data(), 633 reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset, 634 bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device()); 635 if (!status) { 636 context->SetStatus( 637 errors::Internal("Failed launching MaxPoolForwardWithArgmax")); 638 } 639 } 640}; 641 642REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax") 643 .Device(DEVICE_GPU) 644 .TypeConstraint<int64>("Targmax"), 645 MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>); 646 647#endif // GOOGLE_CUDA 648 649} // namespace tensorflow 650