/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_

#include <cstring>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

struct DepthwiseArgs {
  // Input layer dimensions
  int batch;
  int in_rows;
  int in_cols;
  int in_depth;
  int filter_rows;
  int filter_cols;
  int depth_multiplier;
  int stride;
  int pad_rows;
  int pad_cols;

  // Output layer dimensions
  int out_rows;
  int out_cols;
  int out_depth;

  DepthwiseArgs()
      : batch(0),
        in_rows(0),
        in_cols(0),
        in_depth(0),
        filter_rows(0),
        filter_cols(0),
        depth_multiplier(0),
        stride(0),
        pad_rows(0),
        pad_cols(0),
        out_rows(0),
        out_cols(0),
        out_depth(0) {}
};
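
// EX (illustrative values, not taken from any particular model):
//   For a 3 x 3 depthwise convolution over a 32 x 32 x 3 input with
//   depth_multiplier = 2, stride = 1, and 'SAME' padding, one would expect:
//     in_rows = in_cols = 32, in_depth = 3,
//     filter_rows = filter_cols = 3, depth_multiplier = 2, stride = 1,
//     pad_rows = pad_cols = 1,
//     out_rows = out_cols = 32, out_depth = in_depth * depth_multiplier = 6.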

// Forward declaration.
class OpKernelContext;

template <typename Device, typename T>
struct LaunchDepthwiseConvOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* filter, T* output,
                  TensorFormat data_format);
};

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropInputOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* filter, T* in_backprop,
                  TensorFormat data_format);
};

template <typename Device, typename T>
struct LaunchDepthwiseConvBackpropFilterOp {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format);
};
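
// EX (illustrative CPU usage; the kernel context, args, and buffer pointers
// here are hypothetical stand-ins for what a kernel's Compute() would pass):
//   LaunchDepthwiseConvOp<Eigen::ThreadPoolDevice, float>()(
//       ctx, args, input_ptr, filter_ptr, output_ptr, FORMAT_NHWC);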

#if GOOGLE_CUDA
template <typename T>
struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* filter, T* output,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* filter, T* in_backprop,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* out_backprop, const T* input, T* filter_backprop,
                  TensorFormat data_format);
};
#endif  // GOOGLE_CUDA

}  // namespace tensorflow

namespace tensorflow {
namespace functor {

// Pads 'filter' to vector-register boundary along its inner dimension:
//   filter_inner_dim_size = in_depth * depth_multiplier
// Requires 'filter' to have the following storage order:
//   [filter_rows, filter_cols, in_depth, depth_multiplier]
// Returns zero-padded filter in 'padded_filter'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   So we have a total of 3 * 2 = 6 filters, each of spatial size 2 x 2.
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1]
//     [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3]
//
//   padded_filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]

template <typename T>
struct DepthwiseFilterPadOp {
  void operator()(const DepthwiseArgs& args, const T* filter,
                  T* padded_filter) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Calculate vectorized and scalar lengths of filter's inner dimension.
    const int64 filter_inner_dim_size = args.out_depth;
    const int64 vectorized_size =
        (filter_inner_dim_size / kPacketSize) * kPacketSize;
    const int64 scalar_size = filter_inner_dim_size - vectorized_size;
    // Calculate required padding and padded output buffer stride.
    const int64 pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0;
    const int64 padded_filter_stride = vectorized_size + kPacketSize;
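
    // EX (from the comment above): with filter_inner_dim_size = 6 and
    // kPacketSize = 4: vectorized_size = 4, scalar_size = 2, pad_size = 2,
    // and padded_filter_stride = 4 + 4 = 8.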

    const int64 filter_spatial_size = args.filter_rows * args.filter_cols;
    for (int64 i = 0; i < filter_spatial_size; ++i) {
      const int64 input_base = i * filter_inner_dim_size;
      const int64 output_base = i * padded_filter_stride;
      // Write vectorized length of filter's inner dimension to output.
      for (int64 j = 0; j < vectorized_size; j += kPacketSize) {
        const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j);
        Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v);
      }
      // Write scalar length of filter's inner dimension to output.
      for (int64 j = 0; j < scalar_size; ++j) {
        padded_filter[output_base + vectorized_size + j] =
            filter[input_base + vectorized_size + j];
      }
      // Pad the remainder of output to vector-register boundary.
      for (int64 j = 0; j < pad_size; ++j) {
        padded_filter[output_base + vectorized_size + scalar_size + j] =
            static_cast<T>(0);
      }
    }
  }
};

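// EX (illustrative sizing, derived from the strides above): 'padded_filter'
// holds filter_spatial_size * padded_filter_stride elements, i.e.
// 2 * 2 * 8 = 32 in the example above.
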
// Copies data from the local region in 'input' specified by 'out_r' and
// 'out_c' to 'input_buffer'. The copied data is replicated by factor
// 'args.depth_multiplier', and padded to vector register-width boundaries so
// that it is aligned for efficient traversal and vector multiply-add by the
// depthwise kernel.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//
//   input: [batch, in_rows, in_cols, in_depth]
//
//     [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...]
//
//   input_buffer (register boundaries shown):
//     [a0, a0, a1, a1] [a2, a2, 0, 0]   in_row = 0, in_col = 0
//     [b0, b0, b1, b1] [b2, b2, 0, 0]   in_row = 0, in_col = 1
//     [e0, e0, e1, e1] [e2, e2, 0, 0]   in_row = 1, in_col = 0
//     [f0, f0, f1, f1] [f2, f2, 0, 0]   in_row = 1, in_col = 1
//
// Returns replicated and padded data from the specified input region in
// 'input_buffer'.

template <typename T>
struct DepthwiseInputCopyOp {
  void operator()(const DepthwiseArgs& args,
                  const int64 padded_filter_inner_dim_size, const int64 out_r,
                  const int64 out_c, const T* input, T* input_buffer) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
    static const int64 kPacketSize = (sizeof(Packet) / sizeof(T));

    // Calculate vectorized and scalar (residual) lengths for 'in_depth'.
    const int64 input_vectorized_size =
        (args.in_depth / kPacketSize) * kPacketSize;
    const int64 input_scalar_size = args.in_depth % kPacketSize;

    // Calculate vectorized and scalar (residual) lengths for
    // 'depth_multiplier'. This is used to efficiently replicate data when
    // 'depth_multiplier' > kPacketSize.
    const int64 dm_vectorized_size =
        (args.depth_multiplier / kPacketSize) * kPacketSize;
    const int64 dm_scalar_size = args.depth_multiplier % kPacketSize;

    // Calculate output padding length.
    const int64 output_scalar_size = args.out_depth % kPacketSize;
    const int64 output_pad_size =
        output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0;
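
    // EX (from the comment above): with in_depth = 3, depth_multiplier = 2,
    // and kPacketSize = 4: input_vectorized_size = 0, input_scalar_size = 3,
    // dm_vectorized_size = 0, dm_scalar_size = 2, output_scalar_size =
    // 6 % 4 = 2, and output_pad_size = 2.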

    const int64 replicated_packet_size = kPacketSize * args.depth_multiplier;

    // Iterate through all rows x cols, reading 'in_depth' values from 'input'
    // and replicating by 'depth_multiplier' into 'input_buffer' (zero-padding
    // the input buffer where the filter window falls outside the input).
    auto* in_buf = input_buffer;
    const int64 in_r_start = out_r * args.stride - args.pad_rows;
    const int64 in_c_start = out_c * args.stride - args.pad_cols;

    for (int64 f_r = 0; f_r < args.filter_rows; ++f_r) {
      const int64 in_r = in_r_start + f_r;

      for (int64 f_c = 0; f_c < args.filter_cols; ++f_c) {
        const int64 in_c = in_c_start + f_c;

        if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 &&
            in_c < args.in_cols) {
          auto* in = input + (in_r * args.in_cols + in_c) * args.in_depth;
          // Copy vectorized portion of inner dimension.
          for (int64 d = 0; d < input_vectorized_size; d += kPacketSize) {
            auto v = Eigen::internal::ploadu<Packet>(in + d);
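            // pscatter stores the i-th packet element at
            // in_buf + dm + i * args.depth_multiplier, so after all 'dm'
            // iterations each of the kPacketSize input values appears
            // 'depth_multiplier' consecutive times in 'in_buf'.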
            for (int dm = 0; dm < args.depth_multiplier; ++dm) {
              Eigen::internal::pscatter<T, Packet>(in_buf + dm, v,
                                                   args.depth_multiplier);
            }
            in_buf += replicated_packet_size;
          }

          // Copy scalar portion of inner dimension.
          for (int64 d = 0; d < input_scalar_size; ++d) {
            T v = in[input_vectorized_size + d];
            const int64 base = d * args.depth_multiplier;
            if (dm_vectorized_size > 0) {
              // Copy vectorized portion of replicated output.
              // This branch is only taken if 'args.depth_multiplier' is
              // vectorizable (i.e. args.depth_multiplier >= register width).
              auto p = Eigen::internal::pset1<Packet>(v);
              for (int64 dm = 0; dm < dm_vectorized_size; dm += kPacketSize) {
                Eigen::internal::pstoreu<T>(in_buf + base + dm, p);
              }
              // Copy scalar portion of replicated output.
              for (int64 dm = 0; dm < dm_scalar_size; ++dm) {
                in_buf[base + dm_vectorized_size + dm] = v;
              }
            } else {
              // Depth multiplier is less than one packet: scalar copy.
              for (int dm = 0; dm < args.depth_multiplier; ++dm) {
                in_buf[base + dm] = v;
              }
            }
          }
          in_buf += input_scalar_size * args.depth_multiplier;

          // Pad the remainder of the output to vector-register boundary.
          for (int64 d = 0; d < output_pad_size; ++d) {
            in_buf[d] = static_cast<T>(0);
          }
          in_buf += output_pad_size;

        } else {
          // Zero-pad the entire inner dimension for this position, which
          // falls outside the input (i.e. in the padding region).
          memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size);
          in_buf += padded_filter_inner_dim_size;
        }
      }
    }
  }
};
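
// EX (illustrative sizing): 'input_buffer' holds one padded, replicated inner
// dimension per filter element, i.e.
//   filter_rows * filter_cols * padded_filter_inner_dim_size
// elements: 2 * 2 * 8 = 32 in the example above.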

}  // namespace functor
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_