/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include <stdio.h>
#include <cfloat>

#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/type_traits.h"
#include "tensorflow/core/kernels/maxpooling_op.h"
#include "tensorflow/core/kernels/maxpooling_op_gpu.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {
namespace {
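// Comparison used to select the pooling max. With propagate_nans the test is
// written as !(a <= b), which is also true when either operand is NaN (all
// comparisons involving NaN are false), so NaN inputs can win the comparison
// instead of always losing to a plain a > b.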
template <bool propagate_nans, typename dtype>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool IsGreaterThan(dtype a, dtype b) {
  if (propagate_nans) {
    return !(a <= b);
  } else {
    return a > b;
  }
}

// This is Yangqing's custom kernel for the maxpooling operation. There are
// three kernels: MaxPoolForwardNCHW and MaxPoolForwardNHWC handle the forward
// pass for their respective storage orders, and MaxPoolBackward handles the
// backward pass for both storage orders.
// The parameters to the forward kernels are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left sides.
//     top_data: the maxpool output.
//     mask: the output mask, of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output. If a nullptr is passed in for mask, no mask
//         will be produced.
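//         The stored index is relative to the start of its image (an offset
//         within that image's C*H*W or H*W*C block), so the backward kernel
//         adds a per-image offset (see bottom_offset in MaxPoolBackward).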
//
// To call the forward and backward kernels, use e.g.:
// const int kThreadsPerBlock = 1024;
// const int output_size = batch * channels * pooled_height * pooled_width;
// MaxPoolForwardNCHW<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//                      kThreadsPerBlock, 0, cuda_stream>>>(...);
template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNCHW(const int nthreads, const dtype* bottom_data,
                                   const int channels, const int height,
                                   const int width, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = c * height * width + h * width + w;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

// The parameters for MaxPoolForwardNoMaskKernel_NCHW_VECT_C are the same as
// for MaxPoolForwardNCHW above, except that a mask is not supported and each
// element of the input and output packs 4 adjacent channel values for the
// same (x, y) coordinate (so channels is the outer channel count and
// output_size is the real output size / 4).
__global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
    const int nthreads, const int32* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data) {
  // TODO(pauldonnelly): Implement a better optimized version of this kernel.
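  // kMinINT8X4 packs four int8 lanes, each set to 0x80 (-128, the smallest
  // int8 value); it is the identity element for the per-byte signed max
  // computed by __vmaxs4 below.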
  const int32 kMinINT8X4 = 0x80808080;
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    int32 maxval = kMinINT8X4;
    const int32* bottom_data_n = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (c * height + h) * width + w;
        maxval = __vmaxs4(maxval, bottom_data_n[idx]);
      }
    }
    top_data[index] = maxval;
  }
}

template <bool propagate_nans, typename dtype>
__global__ void MaxPoolForwardNHWC(const int nthreads, const dtype* bottom_data,
                                   const int height, const int width,
                                   const int channels, const int pooled_height,
                                   const int pooled_width, const int kernel_h,
                                   const int kernel_w, const int stride_h,
                                   const int stride_w, const int pad_t,
                                   const int pad_l, dtype* top_data,
                                   int64* mask) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (IsGreaterThan<propagate_nans>(bottom_data_n[idx], maxval)) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }
    top_data[index] = maxval;
    if (mask != nullptr) {
      mask[index] = maxidx;
    }
  }
}

template <typename dtype>
__global__ void MaxPoolBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const int height,
    const int width, const int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index of the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    dtype maxval = Eigen::NumTraits<dtype>::lowest();
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        int idx = (h * width + w) * channels + c;
        if (bottom_data_n[idx] > maxval) {
          maxidx = idx;
          maxval = bottom_data_n[idx];
        }
      }
    }

    // Atomically accumulate the bottom diff. The index may still be -1
    // (unset) if all of the bottom_data values in the window are NaN.
    if (maxidx != -1) {
      CudaAtomicAdd(bottom_diff + n * height * width * channels + maxidx,
                    top_diff[index]);
    }
  }
}

// The parameters to the backward kernel are as follows:
//     nthreads: the number of threads, which is equal to the output size.
//     top_diff: the gradient of the output data, of size N*Hout*Wout*C (or
//        N*C*Hout*Wout). As we have stored the flattened index of the input
//        entries, the backward function is agnostic of the input storage order.
//     mask: the output mask, of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool output. This
//         is equal to Hout*Wout*C. We choose to pre-compute this so we do not
//         need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool input.
//         This is equal to H*W*C.
//     bottom_diff: the gradient with respect to the input.
// This kernel relies on CudaAtomicAdd to avoid race conditions. Also,
// bottom_diff must be filled with zeros before the kernel is run.
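// For example (this mirrors MaxPoolBackwardWithArgmax below):
//   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//             kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
//   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
//                     kThreadsPerBlock, 0, d.stream()>>>(
//       output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);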
template <typename dtype>
__global__ void MaxPoolBackward(const int nthreads, const dtype* top_diff,
                                const int64* mask, const int top_offset,
                                const int bottom_offset, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / top_offset);
    CudaAtomicAdd(bottom_diff + image_id * bottom_offset + mask[index],
                  top_diff[index]);
  }
}

// The parameters to the gradient-of-gradient kernels are as follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions N*C*Hout*Wout.
//     bottom_data: the bottom data of N*H*W*C (or N*C*H*W) items.
//     output_data: the output data of N*Hout*Wout*C (or N*C*Hout*Wout) items.
//     height, width, pooled_height, pooled_width: the input and output sizes.
//     kernel_h, kernel_w: the kernel sizes.
//     stride_h, stride_w: the strides.
//     pad_t, pad_l: the padding values on the top and left sides.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W).
//     bottom_diff: the gradient of the gradient w.r.t. the output.
template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNCHW(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index of the maximum, since we have no mask.
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    const int hend = min(hstart + kernel_h, height);
    const int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * channels * height * width;
    // Propagate only first value from top_diff corresponding to the maximum.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = c * height * width + h * width + w;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (an atomic is not necessary). The index may still
    // be -1 (unset) if all of the bottom_data values in the window are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * channels * height * width + maxidx];
    }
  }
}

template <typename dtype>
__global__ void MaxPoolGradBackwardNoMaskNHWC(
    const int nthreads, const dtype* bottom_data, const dtype* output_data,
    const int pooled_height, const int pooled_width, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    const dtype* top_diff, dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // First find out the index of the maximum, since we have no mask.
    int n = index;
    int c = n % channels;
    n /= channels;
    int wstart = (n % pooled_width) * stride_w - pad_l;
    n /= pooled_width;
    int hstart = (n % pooled_height) * stride_h - pad_t;
    n /= pooled_height;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    bool should_stop = false;
    int maxidx = -1;
    const dtype* bottom_data_n = bottom_data + n * height * width * channels;
    // Propagate only first value from top_diff corresponding to the maximum.
    for (int h = hstart; h < hend && !should_stop; ++h) {
      for (int w = wstart; w < wend && !should_stop; ++w) {
        int idx = (h * width + w) * channels + c;
        if (output_data[index] == bottom_data_n[idx]) {
          maxidx = idx;
          should_stop = true;
        }
      }
    }
    // Set the bottom diff (an atomic is not necessary). The index may still
    // be -1 (unset) if all of the bottom_data values in the window are NaN.
    if (maxidx != -1) {
      bottom_diff[index] = top_diff[n * height * width * channels + maxidx];
    }
  }
}

// The parameters to the gradient-of-gradient kernel are as follows:
//     nthreads: the number of threads, which is equal to the output size. The
//         gradient of the MaxPooling gradient w.r.t. the output data has
//         dimensions N*C*Hout*Wout.
//     top_diff: the gradient of the gradient of the output data w.r.t. the
//         input data, of size N*H*W*C (or N*C*H*W). As we have stored the
//         flattened index of the input entries, the backward function is
//         agnostic of the input storage order.
//     mask: the output mask, of the same size as top_data. It is stored in
//         int form, keeping track of the flattened index of the input item that
//         produces the max output.
//     top_offset: the pre-computed per-image offset of the maxpool input
//         gradient. This is equal to H*W*C. We choose to pre-compute this so we
//         do not need to compute it every time inside the kernel.
//     bottom_offset: the pre-computed per-image offset of the maxpool output.
//         This is equal to Hout*Wout*C.
//     bottom_diff: the gradient of the gradient w.r.t. the output.
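// Unlike MaxPoolBackward above, no atomic is needed here: each thread writes
// a distinct bottom_diff[index] and reads a single value from top_diff.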
template <typename dtype>
__global__ void MaxPoolGradBackward(const int nthreads, const dtype* top_diff,
                                    const int64* mask, const int top_offset,
                                    const int bottom_offset,
                                    dtype* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int image_id = (index / bottom_offset);
    bottom_diff[index] = top_diff[image_id * top_offset + mask[index]];
  }
}

#undef CUDA_1D_KERNEL_LOOP
}  // namespace

namespace functor {

// Note: channels is the outer channel count (dim 1), which has already been
// divided by 4.
bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
    const int32* bottom_data, const int batch, const int height,
    const int width, int channels, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
    int32* top_data, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  MaxPoolForwardNoMaskKernel_NCHW_VECT_C<<<
      (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
      0, d.stream()>>>(output_size, bottom_data, height, width, channels,
                       pooled_height, pooled_width, kernel_h, kernel_w,
                       stride_h, stride_w, pad_t, pad_l, top_data);
  d.synchronize();
  return d.ok();
}

template <typename T>
bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, T* top_data,
    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
  const int kThreadsPerBlock = 1024;
  const int output_size = batch * channels * pooled_height * pooled_width;
  if (propagate_nans) {
    MaxPoolForwardNHWC<true>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  } else {
    MaxPoolForwardNHWC<false>
        <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
           kThreadsPerBlock, 0, d.stream()>>>(
            output_size, bottom_data, height, width, channels, pooled_height,
            pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
            top_data, mask);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardNoMask<T>::operator()(
    const T* bottom_data, const int batch, const int height, const int width,
    const int channels, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_t, const int pad_l, const T* top_diff,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;

  const int bottom_size = batch * channels * height * width;
  SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);

  const int top_size = batch * channels * pooled_height * pooled_width;
  MaxPoolBackwardNoMaskNHWC<<<(top_size + kThreadsPerBlock - 1) /
                                  kThreadsPerBlock,
                              kThreadsPerBlock, 0, d.stream()>>>(
      top_size, bottom_data, height, width, channels, pooled_height,
      pooled_width, kernel_h, kernel_w, stride_h, stride_w, pad_t, pad_l,
      top_diff, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  const int kThreadsPerBlock = 1024;
  SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
            kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
  MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
                    kThreadsPerBlock, 0, d.stream()>>>(
      output_size, top_diff, mask, top_offset, bottom_offset, bottom_diff);
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardNoMask<T>::operator()(
    TensorFormat data_format, const T* bottom_data, const T* output_data,
    const int batch, const int pooled_height, const int pooled_width,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
    const int pad_l, const T* top_diff, T* bottom_diff,
    const Eigen::GpuDevice& d) {
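  // One thread per element of bottom_diff (N*C*Hout*Wout here). The launch
  // geometry comes from GetCudaLaunchConfig rather than the fixed 1024-thread
  // blocks used by the functors above.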
  const int num_kernels = batch * channels * pooled_height * pooled_width;
  CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);

  if (data_format == FORMAT_NHWC) {
    MaxPoolGradBackwardNoMaskNHWC<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  } else {
    MaxPoolGradBackwardNoMaskNCHW<<<config.block_count, config.thread_per_block,
                                    0, d.stream()>>>(
        num_kernels, bottom_data, output_data, pooled_height, pooled_width,
        channels, height, width, kernel_h, kernel_w, stride_h, stride_w, pad_t,
        pad_l, top_diff, bottom_diff);
  }
  return d.ok();
}

template <typename T>
bool MaxPoolGradBackwardWithArgmax<T>::operator()(
    const int output_size, const int input_size, const T* top_diff,
    const int64* mask, const int top_offset, const int bottom_offset,
    T* bottom_diff, const Eigen::GpuDevice& d) {
  CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
  MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                        d.stream()>>>(output_size, top_diff, mask, top_offset,
                                      bottom_offset, bottom_diff);
  return d.ok();
}

typedef Eigen::GpuDevice GPUDevice;

#define DEFINE_GPU_KERNELS(T)                          \
  template struct SpatialMaxPooling<GPUDevice, T>;     \
  template struct MaxPoolForwardWithOptionalArgmax<T>; \
  template struct MaxPoolBackwardWithArgmax<T>;        \
  template struct MaxPoolBackwardNoMask<T>;            \
  template struct MaxPoolGradBackwardWithArgmax<T>;    \
  template struct MaxPoolGradBackwardNoMask<T>;

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS);

#undef DEFINE_GPU_KERNELS

}  // namespace functor

}  // end namespace tensorflow

#endif  // GOOGLE_CUDA