reference_ops.h revision 6dfe00ca4114371ed47c93810219736e3deda2d2
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_

#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <type_traits>

#include "third_party/eigen3/Eigen/Core"
#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
    int32 x, int32 quantized_multiplier, int right_shift) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
}

inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
    int32 x, int32 quantized_multiplier, int left_shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                           quantized_multiplier);
}
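
// Illustrative sketch (added for exposition; not part of the original
// header): how a real-valued rescale factor in (0, 1) can be decomposed into
// the (quantized_multiplier, right_shift) pair consumed by
// MultiplyByQuantizedMultiplierSmallerThanOne above. The helper name is
// invented for illustration; TFLite's own quantization utilities perform the
// equivalent computation.
inline void ExampleQuantizeMultiplierSmallerThanOne(
    float real_multiplier, int32* quantized_multiplier, int* right_shift) {
  TFLITE_DCHECK_GT(real_multiplier, 0.f);
  TFLITE_DCHECK_LT(real_multiplier, 1.f);
  int shift = 0;
  // Scale the multiplier up into [0.5, 1) so it uses the full precision of a
  // Q31 fixed-point value; each doubling is compensated by one extra right
  // shift at runtime.
  double scaled = static_cast<double>(real_multiplier);
  while (scaled < 0.5) {
    scaled *= 2.0;
    ++shift;
  }
  int64_t q = static_cast<int64_t>(TfLiteRound(scaled * (1ll << 31)));
  // Rounding can push us to exactly 2^31, which does not fit in int32.
  if (q == (1ll << 31)) {
    q /= 2;
    --shift;
  }
  TFLITE_DCHECK_LE(q, std::numeric_limits<int32>::max());
  *quantized_multiplier = static_cast<int32>(q);
  *right_shift = shift;
}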

template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
                "Only unsigned integer types handled.");
  const T one_in_leading_positive = static_cast<T>(1)
                                    << (std::numeric_limits<T>::digits - 1);
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive) {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}
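
// Worked example (added for exposition): with T = uint8, digits == 8, so
// one_in_leading_positive == 0x80. For integer_input == 0x10 (binary
// 00010000) the loop shifts three times (0x10 -> 0x20 -> 0x40 -> 0x80) and
// returns 3, the number of leading zero bits. Note that the loop does not
// terminate for integer_input == 0, so callers must pass a nonzero value.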

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// ELEMENT-WISE BROADCASTING.
//
// Same as Offset(), except takes an NdArrayDesc<4> instead of Dims<4>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {
  TFLITE_DCHECK(desc0_out != nullptr);
  TFLITE_DCHECK(desc1_out != nullptr);

  // Copy dims to desc.
  for (int i = 0; i < N; ++i) {
    desc0_out->extents[i] = input0_dims.sizes[i];
    desc0_out->strides[i] = input0_dims.strides[i];
    desc1_out->extents[i] = input1_dims.sizes[i];
    desc1_out->strides[i] = input1_dims.strides[i];
  }

  // Walk over each dimension. If the extents are equal, do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i) {
    const int extent0 = ArraySize(input0_dims, i);
    const int extent1 = ArraySize(input1_dims, i);
    if (extent0 != extent1) {
      if (extent0 == 1) {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      } else {
        TFLITE_DCHECK_EQ(extent1, 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
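
// Worked example (added for exposition): broadcasting shape0 = (1, 16, 16, 64)
// against shape1 = (1, 1, 1, 64) in the reversed Dims<4> order described
// above. input1_dims has extents (64, 1, 1, 1) and packed strides
// (1, 64, 64, 64). After NdArrayDescsForElementwiseBroadcast, desc1 has
// extents (64, 16, 16, 1) and strides (1, 0, 0, 64): the zero strides let
// SubscriptToIndex(desc1, c, x, y, b) ignore x and y, so every (x, y)
// position reads the same 64-element slice of input1.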

inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, float output_activation_min,
                 float output_activation_max, float* output_data,
                 const Dims<4>& output_dims, float* im2col_data,
                 const Dims<4>& im2col_dims) {
  (void)im2col_data;  // only used in optimized code.
  (void)im2col_dims;  // only used in optimized code.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
  if (bias_data) {
    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
  }
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          float total = 0.f;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  float input_value = input_data[Offset(input_dims, in_channel,
                                                        in_x, in_y, batch)];
                  float filter_value =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  total += (input_value * filter_value);
                }
              }
            }
          }
          float bias_value = 0.0f;
          if (bias_data) {
            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(total + bias_value,
                                           output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}
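
// Illustrative sketch (added for exposition; not part of the original
// header): the output-size relationship the reference Conv above implicitly
// assumes between input, filter, stride and padding, for the simple case of
// symmetric zero padding of `pad` elements on each side. The helper name is
// invented; TFLite computes output shapes in its kernel preparation code.
inline int ExampleConvOutputSize(int input_size, int filter_size, int stride,
                                 int pad) {
  // Standard convolution arithmetic: the last valid filter origin is
  // input_size + 2 * pad - filter_size, sampled every `stride` elements.
  return (input_size + 2 * pad - filter_size) / stride + 1;
}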

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
          int stride_height, int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
       stride_width, stride_height, pad_width, pad_height,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}

inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_dims;   // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth =
      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_dims, in_channel,
                                                      in_x, in_y, batch)];
                  int32 filter_val =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}
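
// Worked equation (added for exposition): the quantized Conv above relies on
// the affine quantization scheme real_value = scale * (quantized_value -
// zero_point). The offsets passed in are typically the negated zero points,
// so each product (filter_val + filter_offset) * (input_val + input_offset)
// is the integer image of a real-valued product at scale
// input_scale * filter_scale. The final
// MultiplyByQuantizedMultiplierSmallerThanOne applies
// output_multiplier * 2^-output_shift ~= (input_scale * filter_scale) /
// output_scale to bring the accumulator back into the output's quantized
// domain before output_offset (the output zero point) is added.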

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}

template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;
          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
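
// Worked example (added for exposition): with block_size = 2 and
// output_depth = 1, each input pixel holds a 2x2 block of output pixels in
// its 4 channels. E.g. output (out_h, out_w) = (1, 0) maps to in_h = 0,
// in_w = 0 and in_d = out_d + ((1 % 2) * 2 + 0 % 2) * 1 = 2, i.e. channel 2
// of input pixel (0, 0). SpaceToDepth below is the exact inverse mapping.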

template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(weights_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}
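
// Worked example (added for exposition): the --variable_batch hack mentioned
// in the TODO above. The weight matrix has weights_dims = (accum_depth,
// output_depth, 1, 1) and the output has output_dims = (output_depth, b0,
// b1, b2). Because the hack may have overwritten any one of the three outer
// output dimensions with the runtime batch size (the others staying 1), the
// reference FullyConnected treats the product b0 * b1 * b2 as the effective
// batch count.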

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}

inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}
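
// Layout note (added for exposition): both FullyConnected variants above
// index the weight matrix flatly as [out_c * accum_depth + d], i.e. the
// weights for one output channel occupy a contiguous row of accum_depth
// elements. That flat indexing is only valid because of the
// IsPackedWithoutStrides() checks on the input and weight arrays.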

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
                 filter_offset, bias_data, bias_dims, output_offset,
                 output_multiplier, output_shift, output_activation_min,
                 output_activation_max, output_data, output_dims, gemm_context);
}

template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}
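
// Note (added for exposition): both batch-normalization variants above
// compute output = (input - mean) * multiplier + offset. In the usual
// batch-norm formulation the caller is assumed to have folded the learned
// parameters, so that multiplier_data typically holds
// gamma / sqrt(variance + epsilon) and offset_data holds beta.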

inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float lower = 0;
          float clamped = val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float l2_norm = std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
        }
      }
    }
  }
}

inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}
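
// Worked equation (added for exposition): the loop above is the standard
// Newton-Raphson iteration for 1/sqrt(a), i.e. for the root of
// f(x) = 1/x^2 - a:
//   x_{n+1} = x_n * (3 - a * x_n^2) / 2 = 1.5 * x_n - (a / 2) * x_n^3,
// which matches fixedpoint_half_three * x - fixedpoint_half_input * x3,
// where fixedpoint_half_three == 1.5 and fixedpoint_half_input is the
// fixed-point input scaled by 1/2.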

inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[Offset(output_dims, i, 0, 0, 0)] =
        static_cast<uint8>(output_val);
  }
}
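
// Scale note (added for exposition): the quantized L2Normalization above
// hard-codes the output quantization to zero_point = 128, scale = 1/128, so
// the representable output range is [-1, 1]. Each output is approximately
// 128 + 128 * diff / l2_norm, computed via the inverse-sqrt multiplier and
// then clamped to [0, 255].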

inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] +
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int32 input1_val =
              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
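
// Worked equation (added for exposition): the quantized Add above first
// shifts both inputs left by `left_shift` to create headroom, then rescales
// each by its (input{1,2}_multiplier, input{1,2}_shift) pair onto a shared
// intermediate scale, adds them, and finally rescales the sum by
// (output_multiplier, output_shift) onto the output scale before adding
// output_offset. Schematically, with s_i denoting real scales and z_i zero
// points:
//   out ~= (s1 * (q1 - z1) + s2 * (q2 - z2)) / s_out + z_out,
// evaluated entirely in int32 fixed-point arithmetic.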

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <FusedActivationFunctionType Ac>
void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
                  const float* input2_data, const Dims<4>& input2_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
        }
      }
    }
  }
}

inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
               input1_multiplier, input1_shift, input2_data, input2_dims,
               input2_offset, input2_multiplier, input2_shift, output_offset,
               output_multiplier, output_shift, output_activation_min,
               output_activation_max, output_data, output_dims);
}

inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] *
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Mul(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Mul(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <FusedActivationFunctionType Ac>
void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
                  const float* input2_data, const Dims<4>& input2_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
        }
      }
    }
  }
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 unclamped_result =
              output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  input1_val * input2_val, output_multiplier, output_shift);
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, unclamped_result));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
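
// Scale note (added for exposition): unlike the quantized Add, quantized
// multiplication needs no per-input rescaling, because scales factor through
// the product:
//   s1 * (q1 - z1) * s2 * (q2 - z2) = (s1 * s2) * (q1 - z1) * (q2 - z2),
// so a single (output_multiplier, output_shift) pair representing
// s1 * s2 / s_out suffices, as in the BroadcastMul above.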
1136
1137// legacy, for compatibility with old checked-in code
1138template <FusedActivationFunctionType Ac>
1139inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
1140                         int32 input1_offset, const uint8* input2_data,
1141                         const Dims<4>& input2_dims, int32 input2_offset,
1142                         int32 output_offset, int32 output_multiplier,
1143                         int output_shift, int32 output_activation_min,
1144                         int32 output_activation_max, uint8* output_data,
1145                         const Dims<4>& output_dims) {
1146  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
1147               input2_dims, input2_offset, output_offset, output_multiplier,
1148               output_shift, output_activation_min, output_activation_max,
1149               output_data, output_dims);
1150}
1151
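// Elementwise float division with a fused activation clamp. Unlike
// BroadcastMul above, Div and the Sub function below require all four
// dimensions of both inputs to match the output exactly (enforced by
// MatchingArraySize); no broadcasting is performed.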
inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] /
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] -
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

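// Concatenates the inputs along concat_dim; all other dimensions must match
// the output. The memcpy below relies on densely packed arrays: for each
// input, sizes[concat_dim] * strides[concat_dim] elements form one contiguous
// slab spanning concat_dim and every dimension below it, so entire slabs can
// be copied at once.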
template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
                   const Dims<4>* const* input_dims, int inputs_count,
                   Scalar* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_GT(inputs_count, 1);
  int concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    for (int j = 0; j < 4; j++) {
      if (j != concat_dim) {
        MatchingArraySize(*input_dims[i], j, output_dims, j);
      }
    }
    concat_size += ArraySize(*input_dims[i], concat_dim);
  }
  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  int outer_size = 1;
  for (int i = concat_dim + 1; i < 4; i++) {
    outer_size *= output_dims.sizes[i];
  }
  Scalar* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      const int copy_size =
          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
      memcpy(output_ptr, input_data[i] + k * copy_size,
             copy_size * sizeof(Scalar));
      output_ptr += copy_size;
    }
  }
}

template <FusedActivationFunctionType Ac, typename Scalar>
void DepthConcatenation(const Scalar* const* input_data,
                        const Dims<4>* const* input_dims, int inputs_count,
                        Scalar* output_data, const Dims<4>& output_dims) {
  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
                            output_data, output_dims);
}

inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                     const float* prev_activ_data,
                     const Dims<4>& prev_activ_dims, const float* weights_data,
                     const Dims<4>& weights_dims, const float* bias_data,
                     const Dims<4>& bias_dims, const float* prev_state_data,
                     const Dims<4>& prev_state_dims, float* output_state_data,
                     const Dims<4>& output_state_dims, float* output_activ_data,
                     const Dims<4>& output_activ_dims, float* concat_temp_data,
                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
                     const Dims<4>& activ_temp_dims) {
  const int batches =
      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
                        output_state_dims, 3, output_activ_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
                        output_state_dims, 2, output_activ_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
                        output_state_dims, 1, output_activ_dims, 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
                  1);
  const int intern_activ_depth =
      MatchingArraySize(weights_dims, 1, bias_dims, 0);
  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                        output_state_dims, 0, output_activ_dims, 0);
  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);

  // Concatenate prev_activ and input data together
  std::vector<float const*> concat_input_arrays_data;
  std::vector<Dims<4> const*> concat_input_arrays_dims;
  concat_input_arrays_data.push_back(input_data);
  concat_input_arrays_data.push_back(prev_activ_data);
  concat_input_arrays_dims.push_back(&input_dims);
  concat_input_arrays_dims.push_back(&prev_activ_dims);
  Concatenation<FusedActivationFunctionType::kNone, float>(
      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);

  // Fully connected
  FullyConnected<FusedActivationFunctionType::kNone>(
      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
      bias_dims, activ_temp_data, activ_temp_dims);

  // Memory state update (the LSTM "guts")
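  //
  // For each output channel c, with sigm(x) = 1 / (1 + exp(-x)) and
  // D = output_depth, the loop below computes:
  //   input_gate  = sigm(activ_temp[0 * D + c])
  //   new_input   = tanh(activ_temp[1 * D + c])
  //   forget_gate = sigm(activ_temp[2 * D + c])
  //   output_gate = sigm(activ_temp[3 * D + c])
  //   new_state   = input_gate * new_input + forget_gate * prev_state
  //   output      = output_gate * tanh(new_state)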
  for (int b = 0; b < batches; ++b) {
    for (int w = 0; w < width; ++w) {
      for (int h = 0; h < height; ++h) {
        for (int c = 0; c < output_depth; ++c) {
          const float input_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 0 * output_depth + c, w, h, b)]));
          const float new_input = std::tanh(activ_temp_data[Offset(
              activ_temp_dims, 1 * output_depth + c, w, h, b)]);
          const float forget_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 2 * output_depth + c, w, h, b)]));
          const float output_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 3 * output_depth + c, w, h, b)]));
          const float new_state =
              input_gate * new_input +
              forget_gate *
                  prev_state_data[Offset(prev_state_dims, c, w, h, b)];
          output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
          output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
              output_gate * std::tanh(new_state);
        }
      }
    }
  }
}

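// Splits the input along the channel (depth) dimension: output i receives the
// next ArraySize(*output_dims[i], 0) channels. Batches, height and width of
// every output must match the input.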
template <FusedActivationFunctionType Ac, typename Scalar>
void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                     int outputs_count, Scalar* const* output_data,
                     const Dims<4>* const* output_dims) {
  TFLITE_DCHECK_GE(outputs_count, 1);
  for (int i = 0; i < outputs_count; i++) {
    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
  }
  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
  // For now we don't have a model with a TensorFlowSplit with a fused
  // activation function.
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        int in_c = 0;
        for (int i = 0; i < outputs_count; ++i) {
          const int depth = ArraySize(*output_dims[i], 0);
          for (int c = 0; c < depth; ++c) {
            output_data[i][Offset(*output_dims[i], c, x, y, b)] =
                input_data[Offset(input_dims, in_c, x, y, b)];
            in_c++;
          }
        }
        TFLITE_DCHECK(in_c == ArraySize(input_dims, 0));
      }
    }
  }
}

// TODO(benoitjacob) make this a proper reference impl without Eigen!
template <typename Scalar>
using MatrixMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
                                                const Dims<N>& dims) {
  const int rows = dims.sizes[0];
  int cols = 1;
  for (int d = 1; d < N; d++) {
    cols *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
                                               const Dims<N>& dims) {
  const int cols = dims.sizes[N - 1];
  int rows = 1;
  for (int d = 0; d < N - 1; d++) {
    rows *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

inline int NodeOffset(int b, int h, int w, int height, int width) {
  return (b * height + h) * width + w;
}

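// Float average pooling. The filter window is clamped to the input array and
// the divisor (filter_count) counts only the elements actually covered, which
// matches TensorFlow's convention of excluding padding from the average.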
inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int filter_width, int filter_height,
                        float output_activation_min,
                        float output_activation_max, float* output_data,
                        const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float total = 0.f;
          float filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              total +=
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              filter_count++;
            }
          }
          const float average = total / filter_count;
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(average, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int filter_width, int filter_height,
                 float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, float* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_data, output_dims);
}

inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int filter_width, int filter_height,
                        int32 output_activation_min,
                        int32 output_activation_max, uint8* output_data,
                        const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          int32 acc = 0;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              filter_count++;
            }
          }
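          // Round the average to the nearest integer by biasing the sum with
          // half the divisor before dividing.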
          acc = (acc + filter_count / 2) / filter_count;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int filter_width, int filter_height,
                 int32 output_activation_min, int32 output_activation_max,
                 uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_activation_min,
                  output_activation_max, output_data, output_dims);
}

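// L2 pooling: the square root of the mean of squares over each (clamped)
// filter window, followed by the activation clamp.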
inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int filter_width, int filter_height,
                   float output_activation_min, float output_activation_max,
                   float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float sum_squares = 0.f;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              const float val =
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              sum_squares += val * val;
              filter_count++;
            }
          }
          const float l2pool_result = std::sqrt(sum_squares / filter_count);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims,
            int stride_width, int stride_height, int pad_width, int pad_height,
            int filter_width, int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
         pad_height, filter_width, filter_height, output_activation_min,
         output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int filter_width, int filter_height,
            float* output_data, const Dims<4>& output_dims) {
  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
             filter_width, filter_height, output_data, output_dims);
}

inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    float output_activation_min, float output_activation_max,
                    float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float max = std::numeric_limits<float>::lowest();
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(max, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, float* output_data,
             const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             float* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_data, output_dims);
}

inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    int32 output_activation_min, int32 output_activation_max,
                    uint8* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_GE(output_activation_min, 0);
  TFLITE_DCHECK_LE(output_activation_max, 255);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          uint8 max = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          max = std::max<uint8>(max, output_activation_min);
          max = std::min<uint8>(max, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, int32 output_activation_min,
             int32 output_activation_max, uint8* output_data,
             const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             int32 output_activation_min, int32 output_activation_max,
             uint8* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

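// Local response normalization across channels. As implemented below,
//   output[c] = input[c] * (bias + alpha * accum)^(-beta)
// where accum sums input[c']^2 over the clamped half-open channel window
// [c - range, c + range).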
inline void LocalResponseNormalization(const float* input_data,
                                       const Dims<4>& input_dims, int range,
                                       float bias, float alpha, float beta,
                                       float* output_data,
                                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int begin_input_c = std::max(0, c - range);
          const int end_input_c = std::min(depth, c + range);
          float accum = 0.f;
          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
            const float input_val =
                input_data[Offset(input_dims, input_c, x, y, b)];
            accum += input_val * input_val;
          }
          const float multiplier = std::pow(bias + alpha * accum, -beta);
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
        }
      }
    }
  }
}

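// Float softmax along the depth (channel) dimension:
//   output[c] = exp(beta * (input[c] - max)) /
//               sum over c' of exp(beta * (input[c'] - max))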
inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                    float beta, float* output_data,
                    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        // Find the max element value, which we'll use to ensure numerical
        // stability by taking advantage of the following equality:
        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
        float max = std::numeric_limits<float>::lowest();
        for (int c = 0; c < depth; ++c) {
          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // Compute sum.
        float sum = 0.f;
        for (int c = 0; c < depth; ++c) {
          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                          beta);
        }

        // Compute result.
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                       beta) /
              sum;
        }
      }
    }
  }
}

inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_beta_multiplier, int32 input_beta_left_shift,
                    int diff_min, uint8* output_data,
                    const Dims<4>& output_dims) {
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large as
  // -32 before multiplying by input_beta_multiplier, and therefore as large as
  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
  // accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int x = 0; x < width; ++x) {
      for (int y = 0; y < height; ++y) {
        uint8 max_in_row = 0;
        for (int c = 0; c < depth; ++c) {
          max_in_row =
              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
        }

        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
            sum_of_exps =
                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                  exp_on_negative_values(scaled_diff_f8));
          }
        }

        int32 fixed_sum_of_exps = sum_of_exps.raw();
        int headroom_plus_one =
            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
        // This is the number of bits to the left of the binary point above 1.0.
        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
        // no later adjustment will be needed.
        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
        int32 shifted_sum_minus_one = static_cast<int32>(
            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
            (static_cast<uint32>(1) << 31));

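        // After the shift above, the sum is normalized into [1.0, 2.0), so
        // shifted_sum_minus_one is its fractional part in [0.0, 1.0) as a raw
        // Q0.31 value. one_over_one_plus_x_for_x_in_0_1 then gives 1 / (1 + x),
        // i.e. the reciprocal of the normalized sum, in (0.5, 1.0].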
        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
            FixedPoint0::FromRaw(shifted_sum_minus_one));

        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);

            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));

          } else {
            output_data[Offset(output_dims, c, x, y, b)] = 0;
          }
        }
      }
    }
  }
}

inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = 1.f / (1.f + std::exp(-val));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

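// Quantized (uint8) logistic function. Inputs centered on input_zero_point
// that lie beyond +/- input_range_radius saturate directly to 0 or 255. In
// between, the centered input is rescaled into Q4.27 fixed point,
// gemmlowp::logistic is evaluated there, and the Q0.31 result is rounded to
// an 8-bit value (with 256 clamped to 255).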
inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_zero_point, int32 input_range_radius,
                     int32 input_multiplier, int input_left_shift,
                     uint8* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
          const int32 input_val_centered =
              static_cast<int32>(input_val_u8) - input_zero_point;
          uint8 output_val;
          if (input_val_centered <= -input_range_radius) {
            output_val = 0;
          } else if (input_val_centered >= input_range_radius) {
            output_val = 255;
          } else {
            const int32 input_val_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_val_centered, input_multiplier, input_left_shift);
            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
            const FixedPoint4 input_val_f4 =
                FixedPoint4::FromRaw(input_val_rescaled);
            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
            using gemmlowp::RoundingDivideByPOT;
            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
            if (output_val_s32 == 256) {
              output_val_s32 = 255;
            }
            TFLITE_DCHECK_GE(output_val_s32, 0);
            TFLITE_DCHECK_LE(output_val_s32, 255);
            output_val = static_cast<uint8>(output_val_s32);
          }
          output_data[Offset(output_dims, c, x, y, b)] = output_val;
        }
      }
    }
  }
}

inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = std::tanh(val);
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

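// Converts quantized uint8 values back to float:
//   output = scale * (input - zero_point)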
inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                       int32 zero_point, double scale, float* output_data,
                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int32 val = input_data[Offset(input_dims, c, x, y, b)];
          float result = static_cast<float>(scale * (val - zero_point));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
                      float rmin, float rmax, float* output_data,
                      const Dims<4>& output_dims) {
  // 0 should always be a representable value. Let's assume that the initial
  // min/max range contains 0.
  TFLITE_DCHECK_LE(rmin, 0.);
  TFLITE_DCHECK_GE(rmax, 0.);

  // Determine quantization parameters: zero_point, scale.
  using Integer = uint8;
  const Integer qmin = std::numeric_limits<Integer>::min();
  const Integer qmax = std::numeric_limits<Integer>::max();
  const float qmin_float = qmin;
  const float qmax_float = qmax;
  int32 zero_point = 0;
  float scale = 0.f;
  // If rmin==rmax, both must be zero per the above assertion,
  // so we are done.
  if (rmin != rmax) {
    // First determine the scale.
    scale = (rmax - rmin) / (qmax_float - qmin_float);

    // Zero-point computation.
    // First the initial floating-point computation. The zero-point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair
    // will be roughly machine_epsilon * (sum of absolute values of terms)
    // so we want to use the variant that adds the smaller terms.
    const float zero_point_from_min = qmin_float - rmin / scale;
    const float zero_point_from_max = qmax_float - rmax / scale;
    const float zero_point_from_min_error =
        std::abs(qmin_float) + std::abs(rmin / scale);
    const float zero_point_from_max_error =
        std::abs(qmax_float) + std::abs(rmax / scale);

    const float zero_point_float =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer
    // (our zero points are integer, and this is motivated by the requirement
    // to be able to represent the real value "0" exactly as a quantized value,
    // which is required in multiple places, for example in Im2col with SAME
    // padding).
    if (zero_point_float < qmin_float) {
      zero_point = qmin;
    } else if (zero_point_float > qmax_float) {
      zero_point = qmax;
    } else {
      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
    }
    // The zero point should always be in the range of quantized values,
    // [qmin, qmax].
    TFLITE_DCHECK_GE(zero_point, qmin);
    TFLITE_DCHECK_LE(zero_point, qmax);
  }

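  // Quantize, then dequantize, each input value:
  //   q = round(zero_point + x / scale), clamped to [qmin, qmax]
  //   output = scale * (q - zero_point)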
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
          const float unclamped_quantized_val =
              TfLiteRound(zero_point + src_val / scale);
          const float quantized_val = std::min(
              qmax_float, std::max(qmin_float, unclamped_quantized_val));
          const float dst_val = scale * (quantized_val - zero_point);
          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
        }
      }
    }
  }
}

template <typename SrcT, typename DstT>
inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                 DstT* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
          output_data[offset] = static_cast<DstT>(input_data[offset]);
        }
      }
    }
  }
}

inline void Floor(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
          output_data[offset] = std::floor(input_data[offset]);
        }
      }
    }
  }
}

template <typename T>
inline void Gather(const T* input_data, const Dims<4>& input_dims,
                   int input_rank, const int32* coords_data,
                   const Dims<4>& coords_dims, T* output_data,
                   const Dims<4>& output_dims) {
  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
  int stride = input_dims.strides[input_rank - 1];
  T* out = output_data;

  for (int i = 0; i < coords_dims.sizes[0]; i++) {
    TFLITE_DCHECK_GE(coords_data[i], 0);
    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
    const T* in = input_data + coords_data[i] * stride;
    memcpy(out, in, sizeof(T) * stride);
    out += stride;
  }
}

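// Bilinear resizing. Each output pixel maps back to input coordinates at a
// scale of input_size / output_size (no corner alignment) and is a weighted
// average of the four surrounding input pixels:
//   out = in[y0][x0] * (1 - dy) * (1 - dx) + in[y1][x0] * dy * (1 - dx) +
//         in[y0][x1] * (1 - dy) * dx + in[y1][x1] * dy * dx
// with dx = input_x - x0 and dy = input_y - y0.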
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                           const int32* output_size_data,
                           const Dims<4>& output_size_dims, float* output_data,
                           const Dims<4>& output_dims) {
  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  int32 input_height = ArraySize(input_dims, 2);
  int32 input_width = ArraySize(input_dims, 1);
  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      float input_y = y * height_scale;
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        for (int c = 0; c < depth; ++c) {
          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
                                    (1 - (input_y - y0)) *
                                    (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x0, y1, b)] *
                                    (input_y - y0) * (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x1, y0, b)] *
                                    (1 - (input_y - y0)) * (input_x - x0) +
                                input_data[Offset(input_dims, c, x1, y1, b)] *
                                    (input_y - y0) * (input_x - x0);
          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
        }
      }
    }
  }
}

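// SpaceToBatchND for a 4D array. Each output batch corresponds to one
// (shift_h, shift_w) phase of the block grid applied to one input batch;
// output positions that land in the padding region are zero-filled.
// BatchToSpaceND below implements the inverse mapping (without cropping).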
2214template <typename T>
2215inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
2216                           const int32* block_shape_data,
2217                           const Dims<4>& block_shape_dims,
2218                           const int32* paddings_data,
2219                           const Dims<4>& paddings_dims, T* output_data,
2220                           const Dims<4>& output_dims) {
2221  const int output_batch_size = ArraySize(output_dims, 3);
2222  const int output_height = ArraySize(output_dims, 2);
2223  const int output_width = ArraySize(output_dims, 1);
2224  const int input_batch_size = ArraySize(input_dims, 3);
2225  const int input_height = ArraySize(input_dims, 2);
2226  const int input_width = ArraySize(input_dims, 1);
2227  const int depth = ArraySize(input_dims, 0);
2228  const int block_shape_height = block_shape_data[0];
2229  const int block_shape_width = block_shape_data[1];
2230  const int padding_top = paddings_data[0];
2231  const int padding_left = paddings_data[2];
2232
2233  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
2234    int input_batch = out_b % input_batch_size;
2235    int shift_w = (out_b / input_batch_size) % block_shape_width;
2236    int shift_h = (out_b / input_batch_size) / block_shape_width;
2237    for (int out_h = 0; out_h < output_height; ++out_h) {
2238      for (int out_w = 0; out_w < output_width; ++out_w) {
2239        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
2240        if (out_h * block_shape_height + shift_h < padding_top ||
2241            out_h * block_shape_height + shift_h >=
2242                padding_top + input_height ||
2243            out_w * block_shape_width + shift_w < padding_left ||
2244            out_w * block_shape_width + shift_w >= padding_left + input_width) {
2245          memset(out, 0, depth * sizeof(T));
2246        } else {
2247          const T* in =
2248              input_data +
2249              Offset(input_dims, 0,
2250                     (out_w * block_shape_width + shift_w) - padding_left,
2251                     (out_h * block_shape_height + shift_h) - padding_top,
2252                     input_batch);
2253          memcpy(out, in, depth * sizeof(T));
2254        }
2255      }
2256    }
2257  }
2258}

template <typename T>
inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_width = block_shape_data[1];
  const int block_shape_height = block_shape_data[0];

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        int out_batch = in_batch % output_batch_size;
        int out_w = in_w * block_shape_width +
                    (in_batch / output_batch_size) % block_shape_width;
        int out_h = in_h * block_shape_height +
                    (in_batch / output_batch_size) / block_shape_width;
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
        memcpy(out, in, depth * sizeof(T));
      }
    }
  }
}
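
// Example for BatchToSpaceND above: this is the inverse of SpaceToBatchND
// without a crop step (this signature takes no crops argument), so a 4x2x2x3
// input with block_shape {2, 2} is scattered back into a 1x4x4x3 output,
// interleaving the four batches across the spatial phases.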

template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int left_b_padding = left_paddings[3];
  const int left_h_padding = left_paddings[2];
  const int left_w_padding = left_paddings[1];
  const int left_d_padding = left_paddings[0];

  const int right_b_padding = right_paddings[3];
  const int right_h_padding = right_paddings[2];
  const int right_w_padding = right_paddings[1];
  const int right_d_padding = right_paddings[0];

  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            *out_ptr++ = 0;
          } else {
            *out_ptr++ = *in_ptr++;
          }
        }
      }
    }
  }
}
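
// Example for Pad above: the padding vectors are indexed in the reversed
// Dims<4> order (depth, width, height, batch). For illustration, left and
// right paddings of {0, 1, 1, 0} surround a 1x2x2x1 input with a one-pixel
// zero border on height and width, yielding a 1x4x4x1 output.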

inline bool LoopCondition(int index, int stop, int stride) {
  return stride > 0 ? index < stop : index > stop;
}

inline int StartIndex(int start, int stride, int dim, bool masked) {
  return masked ? (stride > 0 ? 0 : dim - 1) : start;
}

inline int StopIndex(int stop, int stride, int dim, bool masked) {
  return masked ? (stride > 0 ? dim : -1) : stop;
}
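
// The helpers above implement the begin_mask/end_mask semantics of
// StridedSlice: a masked start or stop reverts to the full extent for the
// given stride direction. For example, StartIndex(5, 1, 10, true) returns 0,
// and StopIndex(5, -1, 10, true) returns -1 (one past the end when walking
// backwards).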

template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  TFLITE_DCHECK_EQ(starts.size(), 4);
  TFLITE_DCHECK_EQ(stops.size(), 4);
  TFLITE_DCHECK_EQ(strides.size(), 4);
  const int start_b =
      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
  const int stop_b =
      StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8);
  const int start_h =
      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
  const int stop_h =
      StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4);
  const int start_w =
      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
  const int stop_w =
      StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2);
  const int start_d =
      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
  const int stop_d =
      StopIndex(stops[0], strides[0], input_dims.sizes[0], end_mask & 1);

  T* out_ptr = output_data;
  for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]);
       in_b += strides[3]) {
    for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]);
         in_h += strides[2]) {
      for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]);
           in_w += strides[1]) {
        for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]);
             in_d += strides[0]) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}
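
// Example for StridedSlice above: bit k of begin_mask/end_mask corresponds to
// Dims<4> index k (bit 0 = depth, bit 1 = width, bit 2 = height, bit 3 =
// batch). As an illustrative sketch, starts {0, 0, 0, 0}, stops equal to the
// input extents, and strides {1, 2, 1, 1} copy every other column along
// width, halving the output width.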

template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  // TODO(dkalenichenko): This op only supports 4D tensors.
  TFLITE_DCHECK_EQ(begin.size(), 4);
  TFLITE_DCHECK_EQ(size.size(), 4);
  const int start_b = begin[3];
  const int stop_b =
      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
  const int start_h = begin[2];
  const int stop_h =
      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
  const int start_w = begin[1];
  const int stop_w =
      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
  const int start_d = begin[0];
  const int stop_d =
      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        for (int in_d = start_d; in_d < stop_d; ++in_d) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}
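
// Example for Slice above: begin and size are also in (depth, width, height,
// batch) order, and a size entry of -1 means "through the end of that
// dimension". For illustration, begin {0, 1, 1, 0} with size {-1, 2, 2, -1}
// copies a 2x2 spatial window starting at (h, w) = (1, 1), keeping all
// channels and batches.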

template <typename T>
inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
                 T* output_data, const int* output_dims,
                 const int output_num_dims, const int* axis,
                 const int num_axis_dimensions, bool keep_dims, int* temp_index,
                 int* resolved_axis) {
  // resets output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    num_outputs *= static_cast<size_t>(output_dims[idx]);
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = 0;
  }
  // resets temp index.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    temp_index[idx] = 0;
  }
  // resolves axis.
  int num_resolved_axis = 0;
  for (int idx = 0; idx < num_axis_dimensions; ++idx) {
    int current = axis[idx];
    TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0);
    if (current < 0) {
      current += input_num_dims;
    }
    bool is_dup = false;
    for (int j = 0; j < num_resolved_axis; ++j) {
      if (resolved_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      resolved_axis[num_resolved_axis++] = current;
    }
  }
  // iterates through input_data.
  for (bool has_next = true; has_next;
       has_next = NextIndex(input_num_dims, input_dims, temp_index)) {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
    size_t output_offset =
        ReducedOutputOffset(input_num_dims, input_dims, temp_index,
                            num_resolved_axis, resolved_axis);
    output_data[output_offset] += input_data[input_offset];
  }
  // takes average by num of elements added to get mean.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
                                      num_elements_in_axis);
  }
}
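
// Example for the generic Mean above (a sketch of the expected call pattern):
// for a 2x3 input reduced over axis 1, the caller supplies output dims for a
// 2-element result, and each output element is the average of 3 inputs.
// Negative axis values are resolved by adding input_num_dims, and duplicate
// axes are counted only once when computing num_elements_in_axis.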

template <typename T>
inline void Mean(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& reduction_indices, T* output_data,
                 const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);

  // The current implementation only supports simultaneous reduction over
  // width and height.
  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
  TFLITE_DCHECK_EQ(output_height, 1);
  TFLITE_DCHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
        }
      }
      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
          value / (input_width * input_height);
    }
  }
}
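
// Example for the Dims<4> Mean overload above: it behaves like global average
// pooling. For illustration, a 2x4x4x8 (NHWC) input with reduction_indices
// {1, 2} produces a 2x1x1x8 output where each value is the mean over the 16
// spatial positions.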

template <typename T>
void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
         const Dims<4>& input2_dims, T* output_data,
         const Dims<4>& output_dims) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
        }
      }
    }
  }
}
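
// Example for the broadcasting Sub above (illustrative): with input1 of shape
// 1x2x2x3 and input2 of shape 1x1x1x3, the NdArrayDesc for input2 revisits
// its single spatial element at every (x, y), so each output pixel is input1
// minus the per-channel values of input2.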

template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  auto min_value = input2_data[0];

  for (int b = 0; b < batches; b++) {
    for (int y = 0; y < input_height; y++) {
      for (int x = 0; x < input_width; x++) {
        for (int c = 0; c < depth; c++) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] > min_value ? min_value : input1_data[offset];
        }
      }
    }
  }
}
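
// Note on TensorFlowMinimum above (and TensorFlowMaximum below): only
// input2_data[0] is read, so the second operand is treated as a scalar. For
// example, with input2_data[0] = 6, Minimum clamps every element of input1 to
// at most 6, and Maximum clamps every element to at least its scalar.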

template <typename T>
void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  auto max_value = input2_data[0];

  for (int b = 0; b < batches; b++) {
    for (int y = 0; y < input_height; y++) {
      for (int x = 0; x < input_width; x++) {
        for (int c = 0; c < depth; c++) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] < max_value ? max_value : input1_data[offset];
        }
      }
    }
  }
}

template <typename T1, typename T2, typename T3>
void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
            T2* output_data, const Dims<4>& output_dims) {
  // The current ArgMax implementation can only determine the index of the
  // maximum value in the last dimension, so the axis argument is ignored.
  TFLITE_DCHECK_EQ(axis[0], 3);

  // For ArgMax, the number of output dimensions = (number of input dimensions
  // - 1). For the sake of simplicity, the output dimensions are equal to the
  // input dimensions here. We enforce the constraint that the last dimension
  // must always be 1.
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
        int max_index = 0;
        for (int d = 1; d < depth; ++d) {
          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
          if (curr_value > max_value) {
            max_value = curr_value;
            max_index = d;
          }
        }
        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
      }
    }
  }
}
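
// Example for ArgMax above: with depth 4 and per-pixel values {1, 9, 9, 3},
// the output index is 1. Ties keep the first occurrence, because the
// comparison uses a strict greater-than.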

template <typename T>
void Transpose(const T* input, const Dims<4>& input_dims, T* output,
               const Dims<4>& output_dims, int* permuted_axes) {
  int out_sizes[4];
  // Compute the inverse permutation array so we can do an output centered
  // transpose. Also, check to make sure output_dims is matching input_dims.
  for (int k = 0; k < 4; k++) {
    out_sizes[k] =
        MatchingArraySize(input_dims, permuted_axes[k], output_dims, k);
  }

  // Naive transpose loop (iterate on output index and compute input index).
  int o[4];  // loop index (on output).
  int i[4];
  for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) {
    i[permuted_axes[3]] = o[3];
    for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) {
      i[permuted_axes[2]] = o[2];
      for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) {
        i[permuted_axes[1]] = o[1];
        for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) {
          i[permuted_axes[0]] = o[0];
          output[Offset(output_dims, o)] = input[Offset(input_dims, i)];
        }
      }
    }
  }
}
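
// Example for Transpose above: output dimension k takes its extent from input
// dimension permuted_axes[k]. For illustration, permuted_axes {0, 2, 1, 3}
// swaps the width and height axes of a Dims<4> array, turning an NxHxWxC
// tensor (in TensorFlow order) into NxWxHxC.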

}  // namespace reference_ops
}  // namespace tflite

#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_