maxpooling_op.cc revision 200fa5fb0be1523f828014ae8f3ca28099cc942a
/* Copyright 2015 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/nn_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/maxpooling_op.h"

#include <vector>
#include "third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_slice.h"
#include "tensorflow/core/kernels/conv_2d.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/kernels/pooling_ops_common.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"

#if GOOGLE_CUDA
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/kernels/pooling_ops_common_gpu.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;

const int kInvalidMaxPoolingIndex = -1;

template <typename Device, typename T>
struct SpatialMaxPoolWithArgMaxHelper {
  static void Compute(Tensor* output, Tensor* output_arg_max,
                      const Tensor& tensor_in, const PoolParameters& params,
                      const Padding& padding) {
    typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        ConstEigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
        EigenMatrixMap;
    typedef Eigen::Map<Eigen::Matrix<int64, Eigen::Dynamic, Eigen::Dynamic>>
        EigenIndexMatrixMap;

    ConstEigenMatrixMap in_mat(
        tensor_in.flat<T>().data(), params.depth,
        params.tensor_in_cols * params.tensor_in_rows * params.tensor_in_batch);
    EigenMatrixMap out_mat(
        output->flat<T>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);
    EigenIndexMatrixMap out_arg_max_mat(
        output_arg_max->flat<int64>().data(), params.depth,
        params.out_width * params.out_height * params.tensor_in_batch);

    // Initializes output_arg_max with kInvalidMaxPoolingIndex and the output
    // tensor with the lowest value of T.
    output_arg_max->flat<int64>().setConstant(kInvalidMaxPoolingIndex);
    output->flat<T>().setConstant(Eigen::NumTraits<T>::lowest());

    // This code does the following:
    // 1. Flattens the input and output tensors into two dimensional arrays.
    //    tensor_in_as_matrix:
    //      depth by (tensor_in_cols * tensor_in_rows * tensor_in_batch)
    //    output_as_matrix:
    //      depth by (out_width * out_height * tensor_in_batch)
    //
    // 2. Walks through the set of columns in the flattened
    //    tensor_in_as_matrix, and updates the corresponding column(s) in
    //    output_as_matrix with the max value, as in the example below.
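    // For example (illustrative numbers, not from the original comment): with
    // tensor_in_batch = 1, tensor_in_rows = tensor_in_cols = 4 and depth = 3,
    // in_mat is a 3 x 16 matrix whose column (h * 4 + w) holds the depth
    // vector at spatial position (h, w), and out_mat is laid out the same way
    // over the pooled output grid.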
    for (int b = 0; b < params.tensor_in_batch; ++b) {
      for (int h = 0; h < params.tensor_in_rows; ++h) {
        for (int w = 0; w < params.tensor_in_cols; ++w) {
          // [h_start, h_end) x [w_start, w_end) is the range of output cells
          // that this input vector projects to.
          const int hpad = h + params.pad_rows;
          const int wpad = w + params.pad_cols;
          const int h_start =
              (hpad < params.window_rows)
                  ? 0
                  : (hpad - params.window_rows) / params.row_stride + 1;
          const int h_end =
              std::min(hpad / params.row_stride + 1, params.out_height);
          const int w_start =
              (wpad < params.window_cols)
                  ? 0
                  : (wpad - params.window_cols) / params.col_stride + 1;
          const int w_end =
              std::min(wpad / params.col_stride + 1, params.out_width);
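          // For example (illustrative numbers): with window_rows = 3,
          // row_stride = 2, pad_rows = 1 and out_height = 3, input row h = 4
          // gives hpad = 5, h_start = (5 - 3) / 2 + 1 = 2 and
          // h_end = min(5 / 2 + 1, 3) = 3, so only output row ph = 2 reads
          // this input row.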
          // Compute the elementwise max.
          const int in_index =
              (b * params.tensor_in_rows + h) * params.tensor_in_cols + w;
          for (int ph = h_start; ph < h_end; ++ph) {
            for (int pw = w_start; pw < w_end; ++pw) {
              const int out_index =
                  (b * params.out_height + ph) * params.out_width + pw;
              /// NOTE(zhengxq): not using the Eigen matrix operation for now.
              /// May consider parallelizing the operations if needed.
              for (int d = 0; d < params.depth; ++d) {
                const T& input_ref = in_mat.coeffRef(d, in_index);
                T& output_ref = out_mat.coeffRef(d, out_index);
                int64& out_arg_max_ref = out_arg_max_mat.coeffRef(d, out_index);
                if (output_ref < input_ref ||
                    out_arg_max_ref == kInvalidMaxPoolingIndex) {
                  output_ref = input_ref;
                  int input_offset = in_index * params.depth + d;
                  out_arg_max_ref = input_offset;
                }
              }
            }
          }
        }
      }
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_CPU),
                        MaxPoolingOp<CPUDevice, float>);

#if GOOGLE_CUDA
// Forward declarations for the functor specializations for GPU.
namespace functor {
#define DECLARE_GPU_SPEC(T)                                            \
  template <>                                                          \
  void SpatialMaxPooling<Eigen::GpuDevice, T>::operator()(             \
      const Eigen::GpuDevice& d, typename TTypes<T, 4>::Tensor output, \
      typename TTypes<T, 4>::ConstTensor input, int window_rows,       \
      int window_cols, int row_stride, int col_stride,                 \
      const Eigen::PaddingType& padding);                              \
  extern template struct SpatialMaxPooling<Eigen::GpuDevice, T>;

DECLARE_GPU_SPEC(float);
#undef DECLARE_GPU_SPEC
}  // namespace functor

// Note(jiayq): Currently, the Caffe custom implementation is faster than the
// default Eigen implementation, so we are using the custom kernel as the
// default. However, you can explicitly invoke the Eigen version using
// kernel_label_map.
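// (Illustrative note, not from the original file: the Label() in the
// registration below is matched against a node's "_kernel" attribute, so a
// node carrying _kernel: "eigen_tensor" would select the Eigen kernel; the
// exact client-side mechanism may differ by TensorFlow version.)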
REGISTER_KERNEL_BUILDER(Name("MaxPool")
                            .Device(DEVICE_GPU)
                            .Label("eigen_tensor"),
                        MaxPoolingOp<Eigen::GpuDevice, float>);
#endif  // GOOGLE_CUDA

// The operation to compute MaxPool gradients.
// It takes three inputs:
//   - The original input tensor
//   - The original output tensor
//   - Backprop tensor for output
// It produces one output: backprop tensor for input.
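// For example (illustrative shapes, not from the original comment): a 1x4x4x1
// NHWC input pooled with a 2x2 window, stride 2 and VALID padding produces a
// 1x2x2x1 output, so tensor_out and out_backprop are 1x2x2x1 and the computed
// input backprop is 1x4x4x1.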
template <class Device, class T>
class MaxPoolingGradOp : public OpKernel {
 public:
  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
                errors::InvalidArgument(
                    "Default MaxPoolingGradOp only supports NHWC."));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    OP_REQUIRES(
        context, ksize_[3] == 1 && stride_[3] == 1,
        errors::Unimplemented(
            "MaxPoolingGrad is not yet supported on the depth dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape = tensor_in.shape();

    // Tensor index_tensor(context->allocator(), DT_INT32, output_shape);

    Tensor tensor_out_dup;
    OP_REQUIRES_OK(context,
                   context->allocate_temp(DataTypeToEnum<T>::v(),
                                          tensor_out.shape(), &tensor_out_dup));
    Tensor tensor_out_arg_max;
    OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                   tensor_out.shape(),
                                                   &tensor_out_arg_max));

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
    output->flat<T>().setZero();

    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>::Compute(
        &tensor_out_dup, &tensor_out_arg_max, tensor_in, params, padding_);
    auto out_backprop_flat = out_backprop.flat<T>();
    auto input_backprop_flat = output->flat<T>();
    auto out_arg_max_flat = tensor_out_arg_max.flat<int64>();
    int num_total_outputs = out_backprop.flat<T>().size();
    int num_total_inputs = input_backprop_flat.size();

    for (int index = 0; index < num_total_outputs; ++index) {
      int input_backprop_index = out_arg_max_flat(index);
      // Although this check is in the inner loop, it is worth keeping so we
      // don't end up with memory corruption. Our benchmarks show that the
      // performance impact is quite small.
      CHECK(input_backprop_index >= 0 &&
            input_backprop_index < num_total_inputs)
          << "Invalid input backprop index: " << input_backprop_index << ", "
          << num_total_inputs;
      input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_CPU),
                        MaxPoolingGradOp<CPUDevice, float>);

#if GOOGLE_CUDA

static void MaxPoolingBackwardCustomKernel(
    OpKernelContext* context, const std::vector<int32>& size,
    const std::vector<int32>& stride, Padding padding, const Tensor* tensor_in,
    const Tensor& out_backprop, const TensorShape& tensor_in_shape) {
  Tensor* output = nullptr;

  OP_REQUIRES_OK(context,
                 context->allocate_output(0, tensor_in_shape, &output));

  PoolParameters params{context, size,        stride,
                        padding, FORMAT_NHWC, tensor_in_shape};
  if (!context->status().ok()) {
    return;
  }

  MaxPoolBackwardNoMask(
      tensor_in->flat<float>().data(), params.tensor_in_batch,
      params.tensor_in_rows, params.tensor_in_cols, params.depth,
      params.out_height, params.out_width, params.window_rows,
      params.window_cols, params.row_stride, params.col_stride, params.pad_rows,
      params.pad_cols, out_backprop.flat<float>().data(),
      output->flat<float>().data(), context->eigen_device<Eigen::GpuDevice>());
}

template <class T>
class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 public:
  typedef Eigen::GpuDevice Device;

  explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));

    use_dnn_ = CanUseCudnn();
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& tensor_out = context->input(1);
    const Tensor& out_backprop = context->input(2);

    // For maxpooling, tensor_in should have 4 dimensions.
    OP_REQUIRES(context, tensor_in.dims() == 4,
                errors::InvalidArgument("tensor_in must be 4-dimensional"));
    OP_REQUIRES(context, tensor_out.dims() == 4,
                errors::InvalidArgument("tensor_out must be 4-dimensional"));
    // For maxpooling, out_backprop should have 4 dimensions.
    OP_REQUIRES(context, out_backprop.dims() == 4,
                errors::InvalidArgument("out_backprop must be 4-dimensional"));

    TensorShape output_shape = tensor_in.shape();

    if (use_dnn_) {
      DnnPoolingGradOp<T>::Compute(
          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
          stride_, padding_, data_format_, &tensor_in, &tensor_out,
          out_backprop, output_shape);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPoolGrad only supports NHWC format";
      MaxPoolingBackwardCustomKernel(context, ksize_, stride_, padding_,
                                     &tensor_in, out_backprop, output_shape);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
};

REGISTER_KERNEL_BUILDER(Name("MaxPoolGrad").Device(DEVICE_GPU),
                        MaxPoolingGradOp<Eigen::GpuDevice, float>);

#endif  // GOOGLE_CUDA

template <typename Device, typename T>
struct LaunchMaxPoolingNoMask;

template <typename Device, typename T>
class MaxPoolingNoMaskOp : public OpKernel {
 public:
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES(context, data_format_ == FORMAT_NHWC,
                errors::InvalidArgument(
                    "Default MaxPoolingNoMaskOp only supports NHWC."));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                              output);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingWithArgmax;

template <typename Device, typename T>
class MaxPoolingWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.out_height,
                           params.out_width, params.depth});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
    Tensor* argmax = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(1, out_shape, &argmax));

    LaunchMaxPoolingWithArgmax<Device, T>::launch(context, params, tensor_in,
                                                  output, argmax);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
};

template <typename Device, typename T>
struct LaunchMaxPoolingGradWithArgmax;

template <typename Device, typename T>
class MaxPoolingGradWithArgmaxOp : public OpKernel {
 public:
  explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES(context, ksize_[0] == 1 && stride_[0] == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);
    const Tensor& grad_in = context->input(1);
    const Tensor& argmax = context->input(2);

    PoolParameters params{context,  ksize_,      stride_,
                          padding_, FORMAT_NHWC, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
                           params.tensor_in_cols, params.depth});
    Tensor* grad_out = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out));

    LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
                                                      argmax, grad_out);
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
};

#if GOOGLE_CUDA
template <typename T>
class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
 public:
  typedef GPUDevice Device;
  explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
      : OpKernel(context) {
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));
    OP_REQUIRES_OK(context, context->GetAttr("ksize", &ksize_));
    OP_REQUIRES(context, ksize_.size() == 4,
                errors::InvalidArgument("Sliding window ksize field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("strides", &stride_));
    OP_REQUIRES(context, stride_.size() == 4,
                errors::InvalidArgument("Sliding window stride field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    const int32 ksize_n = GetTensorDim(ksize_, data_format_, 'N');
    const int32 stride_n = GetTensorDim(stride_, data_format_, 'N');
    OP_REQUIRES(context, ksize_n == 1 && stride_n == 1,
                errors::Unimplemented(
                    "Pooling is not yet supported on the batch dimension."));
    use_dnn_ = CanUseCudnn();
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& tensor_in = context->input(0);

    PoolParameters params{context,  ksize_,       stride_,
                          padding_, data_format_, tensor_in.shape()};
    if (!context->status().ok()) {
      return;
    }

    TensorShape out_shape =
        ShapeFromFormat(data_format_, params.tensor_in_batch, params.out_height,
                        params.out_width, params.depth);
    if (use_dnn_ && data_format_ == FORMAT_NCHW) {
      DnnPoolingOp<T>::Compute(
          context, perftools::gputools::dnn::PoolingMode::kMaximum, ksize_,
          stride_, padding_, data_format_, tensor_in, out_shape);
    } else {
      CHECK(data_format_ == FORMAT_NHWC)
          << "Non-Cudnn MaxPool only supports NHWC format";
      Tensor* output = nullptr;
      OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
      LaunchMaxPoolingNoMask<Device, T>::launch(context, params, tensor_in,
                                                output);
    }
  }

 private:
  std::vector<int32> ksize_;
  std::vector<int32> stride_;
  Padding padding_;
  TensorFormat data_format_;
  bool use_dnn_;
};

template <typename T>
struct LaunchMaxPoolingNoMask<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output) {
    bool status = MaxPoolForwardWithOptionalArgmax(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(), nullptr, context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardNoMask"));
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("MaxPool").Device(DEVICE_GPU),
                        MaxPoolingNoMaskOp<Eigen::GpuDevice, float>);

template <typename T>
struct LaunchMaxPoolingWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& input, Tensor* output, Tensor* argmax) {
    bool status = MaxPoolForwardWithOptionalArgmax(
        input.flat<T>().data(), params.tensor_in_batch, params.tensor_in_rows,
        params.tensor_in_cols, params.depth, params.out_height,
        params.out_width, params.window_rows, params.window_cols,
        params.row_stride, params.col_stride, params.pad_rows, params.pad_cols,
        output->flat<T>().data(),
        reinterpret_cast<int64*>(argmax->flat<int64>().data()),
        context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolForwardWithArgmax"));
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int64>("Targmax"),
                        MaxPoolingWithArgmaxOp<Eigen::GpuDevice, float>);

template <typename T>
struct LaunchMaxPoolingGradWithArgmax<Eigen::GpuDevice, T> {
  static void launch(OpKernelContext* context, const PoolParameters& params,
                     const Tensor& grad_in, const Tensor& argmax,
                     Tensor* grad_out) {
    const int input_size = params.tensor_in_batch * params.tensor_in_rows *
                           params.tensor_in_cols * params.depth;
    const int output_size = params.tensor_in_batch * params.out_height *
                            params.out_width * params.depth;
    const int top_offset = params.out_height * params.out_width * params.depth;
    const int bottom_offset =
        params.tensor_in_rows * params.tensor_in_cols * params.depth;
    bool status = MaxPoolBackwardWithArgmax(
        output_size, input_size, grad_in.flat<T>().data(),
        reinterpret_cast<const int64*>(argmax.flat<int64>().data()), top_offset,
        bottom_offset, grad_out->flat<T>().data(), context->eigen_gpu_device());
    if (!status) {
      context->SetStatus(
          errors::Internal("Failed launching MaxPoolBackwardWithArgmax"));
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int64>("Targmax"),
                        MaxPoolingGradWithArgmaxOp<Eigen::GpuDevice, float>);

#endif  // GOOGLE_CUDA

}  // namespace tensorflow