reference_ops.h revision 9393d604ebd63664a27d28617f95fa5f60495270
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_
#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_

#include <stdint.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstring>  // for memcpy, used by Concatenation below
#include <limits>
#include <memory>
#include <type_traits>

#include "third_party/eigen3/Eigen/Core"
#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
    int32 x, int32 quantized_multiplier, int right_shift) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
}

inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
    int32 x, int32 quantized_multiplier, int left_shift) {
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
                                           quantized_multiplier);
}
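
// Fixed-point semantics sketch for the two helpers above: in gemmlowp,
// SaturatingRoundingDoublingHighMul(a, b) computes roughly (a * b) / 2^31
// with rounding and saturation, and RoundingDivideByPOT(v, n) computes
// v / 2^n with round-to-nearest. So a real multiplier m in (0, 1) encoded as
//   m = quantized_multiplier * 2^-31 * 2^-right_shift
// is applied by MultiplyByQuantizedMultiplierSmallerThanOne(x, ...), which
// returns approximately x * m. For example (illustrative values), m = 0.375
// may be encoded as quantized_multiplier = 0x60000000 (0.75 in Q0.31) with
// right_shift = 1.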

template <typename T>
int CountLeadingZeros(T integer_input) {
  static_assert(std::is_unsigned<T>::value,
                "Only unsigned integer types handled.");
  // Note: the input must be nonzero; a zero input never reaches the leading
  // one bit and would loop forever.
  const T one_in_leading_positive = static_cast<T>(1)
                                    << (std::numeric_limits<T>::digits - 1);
  int leading_zeros = 0;
  while (integer_input < one_in_leading_positive) {
    integer_input <<= 1;
    ++leading_zeros;
  }
  return leading_zeros;
}

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// ELEMENT-WISE BROADCASTING.
//
// Same as Offset(), except takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refers to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {
  TFLITE_DCHECK(desc0_out != nullptr);
  TFLITE_DCHECK(desc1_out != nullptr);

  // Copy dims to desc.
  for (int i = 0; i < N; ++i) {
    desc0_out->extents[i] = input0_dims.sizes[i];
    desc0_out->strides[i] = input0_dims.strides[i];
    desc1_out->extents[i] = input1_dims.sizes[i];
    desc1_out->strides[i] = input1_dims.strides[i];
  }

  // Walk over each dimension. If the extents are equal do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other and
  // stride 0.
  for (int i = 0; i < N; ++i) {
    const int extent0 = ArraySize(input0_dims, i);
    const int extent1 = ArraySize(input1_dims, i);
    if (extent0 != extent1) {
      if (extent0 == 1) {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      } else {
        TFLITE_DCHECK_EQ(extent1, 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}
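
// Worked example of the broadcast adjustment above (illustrative shapes):
// with N = 4, let input0 have (reverse-order) extents (64, 16, 16, 1) and
// input1 have extents (64, 1, 1, 1), i.e. TensorFlow shapes (1, 16, 16, 64)
// and (1, 1, 1, 64). desc0_out is copied through unchanged, while desc1_out
// gets extents (64, 16, 16, 1) with strides of 0 in the three broadcast
// dimensions, so SubscriptToIndex(desc1, c, x, y, b) reads the same 64
// channel values for every (x, y, b).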

inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, float output_activation_min,
                 float output_activation_max, float* output_data,
                 const Dims<4>& output_dims, float* im2col_data,
                 const Dims<4>& im2col_dims) {
  (void)im2col_data;  // only used in optimized code.
  (void)im2col_dims;  // only used in optimized code.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
  if (bias_data) {
    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
  }
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          float total = 0.f;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  float input_value = input_data[Offset(input_dims, in_channel,
                                                        in_x, in_y, batch)];
                  float filter_value =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  total += (input_value * filter_value);
                }
              }
            }
          }
          float bias_value = 0.0f;
          if (bias_data) {
            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(total + bias_value,
                                           output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}
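
// Shape sketch for the reference Conv above (hypothetical sizes): Dims<4>
// stores dimensions in reverse order, so an NHWC input of shape
// (batches=1, height=8, width=8, depth=16) has input_dims.sizes =
// {16, 8, 8, 1}. With a 3x3 filter of sizes {16, 3, 3, 32}, stride 1 and
// pad 1, output_dims.sizes is {32, 8, 8, 1} and the loops compute
//   output[c, x, y, b] = Act(bias[c] + sum over (fy, fx, ic) of
//       input[ic, x + fx - 1, y + fy - 1, b] * filter[ic, fx, fy, c]),
// with out-of-bounds input positions contributing zero.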

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
          int stride_height, int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
       stride_width, stride_height, pad_width, pad_height,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}

inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_dims;   // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth =
      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_dims, in_channel,
                                                      in_x, in_y, batch)];
                  int32 filter_val =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}
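
// Quantization sketch for the uint8 Conv above: each uint8 tensor q encodes
// real values real = scale * (q - zero_point), and the *_offset parameters
// are the negated zero points. The int32 accumulator therefore holds the
// real-valued accumulation at scale input_scale * filter_scale, and
// (output_multiplier, output_shift) encode the rescaling factor
// input_scale * filter_scale / output_scale, which must be smaller than one
// for MultiplyByQuantizedMultiplierSmallerThanOne to apply.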

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}

template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;
          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}
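
// Example for DepthToSpace above (block_size = 2, illustrative): an input of
// TensorFlow shape (batch, H, W, 4 * D) becomes an output of shape
// (batch, 2H, 2W, D); output element (b, h, w, d) is read from input element
// (b, h / 2, w / 2, d + ((h % 2) * 2 + w % 2) * D), i.e. each group of four
// input channels unpacks into a 2x2 spatial block. SpaceToDepth below is the
// exact inverse mapping.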

template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(weights_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}
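
// In matrix terms, the float FullyConnected above computes, for each batch b,
//   output[b] = Act(W * input[b] + bias),
// where W is an output_depth x accum_depth row-major matrix and input[b] is
// an accum_depth-long vector; the batches count deliberately folds output
// dims 1..3 together, per the --variable_batch note above.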

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}

inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
                 filter_offset, bias_data, bias_dims, output_offset,
                 output_multiplier, output_shift, output_activation_min,
                 output_activation_max, output_data, output_dims, gemm_context);
}

template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}

inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float lower = 0;
          float clamped = val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float l2_norm = std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
        }
      }
    }
  }
}

inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}
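
// The loop above is Newton-Raphson for f(x) = 1/x^2 - a, whose update rule is
//   x_{n+1} = x_n * (3 - a * x_n^2) / 2 = 1.5 * x_n - (a / 2) * x_n^3,
// i.e. fixedpoint_half_three * x - fixedpoint_half_input * x3; it converges
// to 1 / sqrt(a). Since the value fed into the iteration is the input halved
// (input >> 1), the final multiplication by sqrt(2) / 2 = 1 / sqrt(2)
// rescales 1 / sqrt(a / 2) back to 1 / sqrt(a).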

inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[Offset(output_dims, i, 0, 0, 0)] =
        static_cast<uint8>(output_val);
  }
}
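
// Output quantization sketch for the uint8 L2Normalization above: the second
// loop produces approximately 128 + 128 * diff / l2_norm, i.e. the unit-norm
// result in [-1, 1] is mapped onto uint8 with zero point 128 and scale 1/128
// (hence the fixed factor of 128 and the clamp to [0, 255]).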

inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] +
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
      output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int32 input1_val =
              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}
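
// Rescaling sketch for the quantized Add above: both zero-point-adjusted
// inputs are first shifted left by left_shift for headroom, then each is
// rescaled by its (multiplier, shift) pair onto a shared intermediate scale;
// the int32 sum is rescaled by (output_multiplier, output_shift) to the
// output scale, offset by the output zero point, and clamped. left_shift
// trades integer headroom for fractional precision and is chosen so the
// shifted values cannot overflow int32.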
887
888// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
889// dimensionality if the runtime code does a single loop over one dimension
890// that handles broadcasting as the base case. The code generator would then
891// generate max(D1, D2) nested for loops.
892template <FusedActivationFunctionType Ac>
893void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
894                  const float* input2_data, const Dims<4>& input2_dims,
895                  float* output_data, const Dims<4>& output_dims) {
896  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
897
898  NdArrayDesc<4> desc1;
899  NdArrayDesc<4> desc2;
900  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
901
902  // In Tensorflow, the dimensions are canonically named (batch_number, row,
903  // col, channel), with extents (batches, height, width, depth), with the
904  // trailing dimension changing most rapidly (channels has the smallest stride,
905  // typically 1 element).
906  //
907  // In generated C code, we store arrays with the dimensions reversed. The
908  // first dimension has smallest stride.
909  //
910  // We name our variables by their Tensorflow convention, but generate C code
911  // nesting loops such that the innermost loop has the smallest stride for the
912  // best cache behavior.
913  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
914    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
915      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
916        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
917          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
918              input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
919              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
920        }
921      }
922    }
923  }
924}
925
926inline void BroadcastAdd(int left_shift, const uint8* input1_data,
927                         const Dims<4>& input1_dims, int32 input1_offset,
928                         int32 input1_multiplier, int input1_shift,
929                         const uint8* input2_data, const Dims<4>& input2_dims,
930                         int32 input2_offset, int32 input2_multiplier,
931                         int input2_shift, int32 output_offset,
932                         int32 output_multiplier, int output_shift,
933                         int32 output_activation_min,
934                         int32 output_activation_max, uint8* output_data,
935                         const Dims<4>& output_dims) {
936  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
937
938  NdArrayDesc<4> desc1;
939  NdArrayDesc<4> desc2;
940  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
941
942  // In Tensorflow, the dimensions are canonically named (batch_number, row,
943  // col, channel), with extents (batches, height, width, depth), with the
944  // trailing dimension changing most rapidly (channels has the smallest stride,
945  // typically 1 element).
946  //
947  // In generated C code, we store arrays with the dimensions reversed. The
948  // first dimension has smallest stride.
949  //
950  // We name our variables by their Tensorflow convention, but generate C code
951  // nesting loops such that the innermost loop has the smallest stride for the
952  // best cache behavior.
953  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
954    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
955      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
956        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
957          const int32 input1_val =
958              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
959          const int32 input2_val =
960              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
961          const int32 shifted_input1_val = input1_val * (1 << left_shift);
962          const int32 shifted_input2_val = input2_val * (1 << left_shift);
963          const int32 scaled_input1_val =
964              MultiplyByQuantizedMultiplierSmallerThanOne(
965                  shifted_input1_val, input1_multiplier, input1_shift);
966          const int32 scaled_input2_val =
967              MultiplyByQuantizedMultiplierSmallerThanOne(
968                  shifted_input2_val, input2_multiplier, input2_shift);
969          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
970          const int32 raw_output =
971              MultiplyByQuantizedMultiplierSmallerThanOne(
972                  raw_sum, output_multiplier, output_shift) +
973              output_offset;
974          const int32 clamped_output =
975              std::min(output_activation_max,
976                       std::max(output_activation_min, raw_output));
977          output_data[Offset(output_dims, c, x, y, b)] =
978              static_cast<uint8>(clamped_output);
979        }
980      }
981    }
982  }
983}
984
985template <FusedActivationFunctionType Ac>
986inline void BroadcastAdd(int left_shift, const uint8* input1_data,
987                         const Dims<4>& input1_dims, int32 input1_offset,
988                         int32 input1_multiplier, int input1_shift,
989                         const uint8* input2_data, const Dims<4>& input2_dims,
990                         int32 input2_offset, int32 input2_multiplier,
991                         int input2_shift, int32 output_offset,
992                         int32 output_multiplier, int output_shift,
993                         int32 output_activation_min,
994                         int32 output_activation_max, uint8* output_data,
995                         const Dims<4>& output_dims) {
996  static_assert(Ac == FusedActivationFunctionType::kNone ||
997                    Ac == FusedActivationFunctionType::kRelu ||
998                    Ac == FusedActivationFunctionType::kRelu6 ||
999                    Ac == FusedActivationFunctionType::kRelu1,
1000                "");
1001  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
1002  if (Ac == FusedActivationFunctionType::kNone) {
1003    TFLITE_DCHECK_EQ(output_activation_min, 0);
1004    TFLITE_DCHECK_EQ(output_activation_max, 255);
1005  }
1006  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
1007               input1_multiplier, input1_shift, input2_data, input2_dims,
1008               input2_offset, input2_multiplier, input2_shift, output_offset,
1009               output_multiplier, output_shift, output_activation_min,
1010               output_activation_max, output_data, output_dims);
1011}
1012
1013inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
1014                const float* input2_data, const Dims<4>& input2_dims,
1015                float output_activation_min, float output_activation_max,
1016                float* output_data, const Dims<4>& output_dims) {
1017  const int batches =
1018      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
1019  const int height =
1020      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
1021  const int width =
1022      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
1023  const int depth =
1024      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
1025  for (int b = 0; b < batches; ++b) {
1026    for (int y = 0; y < height; ++y) {
1027      for (int x = 0; x < width; ++x) {
1028        for (int c = 0; c < depth; ++c) {
1029          output_data[Offset(output_dims, c, x, y, b)] =
1030              ActivationFunctionWithMinMax(
1031                  input1_data[Offset(input1_dims, c, x, y, b)] *
1032                      input2_data[Offset(input2_dims, c, x, y, b)],
1033                  output_activation_min, output_activation_max);
1034        }
1035      }
1036    }
1037  }
1038}
1039
1040// legacy, for compatibility with old checked-in code
1041template <FusedActivationFunctionType Ac>
1042void Mul(const float* input1_data, const Dims<4>& input1_dims,
1043         const float* input2_data, const Dims<4>& input2_dims,
1044         float* output_data, const Dims<4>& output_dims) {
1045  float output_activation_min, output_activation_max;
1046  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
1047
1048  Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
1049      output_activation_max, output_data, output_dims);
1050}
1051
1052// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
1053// dimensionality if the runtime code does a single loop over one dimension
1054// that handles broadcasting as the base case. The code generator would then
1055// generate max(D1, D2) nested for loops.
1056template <FusedActivationFunctionType Ac>
1057void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims,
1058                  const float* input2_data, const Dims<4>& input2_dims,
1059                  float* output_data, const Dims<4>& output_dims) {
1060  gemmlowp::ScopedProfilingLabel label("BroadcastMul");
1061
1062  NdArrayDesc<4> desc1;
1063  NdArrayDesc<4> desc2;
1064  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
1065
1066  // In Tensorflow, the dimensions are canonically named (batch_number, row,
1067  // col, channel), with extents (batches, height, width, depth), with the
1068  // trailing dimension changing most rapidly (channels has the smallest
1069  // stride, typically 1 element).
1070  //
1071  // In generated C code, we store arrays with the dimensions reversed. The
1072  // first dimension has smallest stride.
1073  //
1074  // We name our variables by their Tensorflow convention, but generate C code
1075  // nesting loops such that the innermost loop has the smallest stride for
1076  // the best cache behavior.
1077  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
1078    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
1079      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
1080        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
1081          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
1082              input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
1083              input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
1084        }
1085      }
1086    }
1087  }
1088}
1089
1090inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
1091                         int32 input1_offset, const uint8* input2_data,
1092                         const Dims<4>& input2_dims, int32 input2_offset,
1093                         int32 output_offset, int32 output_multiplier,
1094                         int output_shift, int32 output_activation_min,
1095                         int32 output_activation_max, uint8* output_data,
1096                         const Dims<4>& output_dims) {
1097  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");
1098
1099  NdArrayDesc<4> desc1;
1100  NdArrayDesc<4> desc2;
1101  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
1102
1103  // In Tensorflow, the dimensions are canonically named (batch_number, row,
1104  // col, channel), with extents (batches, height, width, depth), with the
1105  // trailing dimension changing most rapidly (channels has the smallest
1106  // stride, typically 1 element).
1107  //
1108  // In generated C code, we store arrays with the dimensions reversed. The
1109  // first dimension has smallest stride.
1110  //
1111  // We name our variables by their Tensorflow convention, but generate C code
1112  // nesting loops such that the innermost loop has the smallest stride for
1113  // the best cache behavior.
1114  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
1115    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
1116      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
1117        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
1118          const int32 input1_val =
1119              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
1120          const int32 input2_val =
1121              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
1122          const int32 unclamped_result =
1123              output_offset +
1124              MultiplyByQuantizedMultiplierSmallerThanOne(
1125                  input1_val * input2_val, output_multiplier, output_shift);
1126          const int32 clamped_output =
1127              std::min(output_activation_max,
1128                       std::max(output_activation_min, unclamped_result));
1129          output_data[Offset(output_dims, c, x, y, b)] =
1130              static_cast<uint8>(clamped_output);
1131        }
1132      }
1133    }
1134  }
1135}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
                   const Dims<4>* const* input_dims, int inputs_count,
                   Scalar* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_GT(inputs_count, 1);
  int concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    for (int j = 0; j < 4; j++) {
      if (j != concat_dim) {
        MatchingArraySize(*input_dims[i], j, output_dims, j);
      }
    }
    concat_size += ArraySize(*input_dims[i], concat_dim);
  }
  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  int outer_size = 1;
  for (int i = concat_dim + 1; i < 4; i++) {
    outer_size *= output_dims.sizes[i];
  }
  Scalar* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      const int copy_size =
          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
      memcpy(output_ptr, input_data[i] + k * copy_size,
             copy_size * sizeof(Scalar));
      output_ptr += copy_size;
    }
  }
}
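
// For example (hypothetical shapes): concatenating two inputs with depths 2
// and 3 along concat_dim = 0 produces an output of depth 5; for each outer
// index, the 2 channels of the first input are copied, then the 3 channels
// of the second.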

template <FusedActivationFunctionType Ac, typename Scalar>
void DepthConcatenation(const Scalar* const* input_data,
                        const Dims<4>* const* input_dims, int inputs_count,
                        Scalar* output_data, const Dims<4>& output_dims) {
  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
                            output_data, output_dims);
}

inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                     const float* prev_activ_data,
                     const Dims<4>& prev_activ_dims, const float* weights_data,
                     const Dims<4>& weights_dims, const float* bias_data,
                     const Dims<4>& bias_dims, const float* prev_state_data,
                     const Dims<4>& prev_state_dims, float* output_state_data,
                     const Dims<4>& output_state_dims, float* output_activ_data,
                     const Dims<4>& output_activ_dims, float* concat_temp_data,
                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
                     const Dims<4>& activ_temp_dims) {
  const int batches =
      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
                        output_state_dims, 3, output_activ_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
                        output_state_dims, 2, output_activ_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
                        output_state_dims, 1, output_activ_dims, 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
                  1);
  const int intern_activ_depth =
      MatchingArraySize(weights_dims, 1, bias_dims, 0);
  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                        output_state_dims, 0, output_activ_dims, 0);
  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);

  // Concatenate prev_activ and input data together
  std::vector<float const*> concat_input_arrays_data;
  std::vector<Dims<4> const*> concat_input_arrays_dims;
  concat_input_arrays_data.push_back(input_data);
  concat_input_arrays_data.push_back(prev_activ_data);
  concat_input_arrays_dims.push_back(&input_dims);
  concat_input_arrays_dims.push_back(&prev_activ_dims);
  Concatenation<FusedActivationFunctionType::kNone, float>(
      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);

  // Fully connected
  FullyConnected<FusedActivationFunctionType::kNone>(
      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
      bias_dims, activ_temp_data, activ_temp_dims);

  // Memory state update (the LSTM "guts")
  for (int b = 0; b < batches; ++b) {
    for (int w = 0; w < width; ++w) {
      for (int h = 0; h < height; ++h) {
        for (int c = 0; c < output_depth; ++c) {
          const float input_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 0 * output_depth + c, w, h, b)]));
          const float new_input = std::tanh(activ_temp_data[Offset(
              activ_temp_dims, 1 * output_depth + c, w, h, b)]);
          const float forget_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 2 * output_depth + c, w, h, b)]));
          const float output_gate =
              1.f /
              (1.f + std::exp(-activ_temp_data[Offset(
                         activ_temp_dims, 3 * output_depth + c, w, h, b)]));
          const float new_state =
              input_gate * new_input +
              forget_gate *
                  prev_state_data[Offset(prev_state_dims, c, w, h, b)];
          output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state;
          output_activ_data[Offset(output_activ_dims, c, w, h, b)] =
              output_gate * std::tanh(new_state);
        }
      }
    }
  }
}
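
// The loop above reads the four gate pre-activations that FullyConnected
// packed along the depth of activ_temp, in the order (input gate i, new
// input g, forget gate f, output gate o). In math form, this is the standard
// LSTM update:
//
//   new_state = sigmoid(i) * tanh(g) + sigmoid(f) * prev_state
//   new_activ = sigmoid(o) * tanh(new_state)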

template <FusedActivationFunctionType Ac, typename Scalar>
void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                     int outputs_count, Scalar* const* output_data,
                     const Dims<4>* const* output_dims) {
  TFLITE_DCHECK_GE(outputs_count, 1);
  for (int i = 0; i < outputs_count; i++) {
    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
  }
  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
  // For now we don't have a model with a TensorFlowSplit with a fused
  // activation function.
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        int in_c = 0;
        for (int i = 0; i < outputs_count; ++i) {
          const int depth = ArraySize(*output_dims[i], 0);
          for (int c = 0; c < depth; ++c) {
            output_data[i][Offset(*output_dims[i], c, x, y, b)] =
                input_data[Offset(input_dims, in_c, x, y, b)];
            in_c++;
          }
        }
        TFLITE_DCHECK(in_c == ArraySize(input_dims, 0));
      }
    }
  }
}

// TODO(benoitjacob) make this a proper reference impl without Eigen!
template <typename Scalar>
using MatrixMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
                                                const Dims<N>& dims) {
  const int rows = dims.sizes[0];
  int cols = 1;
  for (int d = 1; d < N; d++) {
    cols *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
                                               const Dims<N>& dims) {
  const int cols = dims.sizes[N - 1];
  int rows = 1;
  for (int d = 0; d < N - 1; d++) {
    rows *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

inline int NodeOffset(int b, int h, int w, int height, int width) {
  return (b * height + h) * width + w;
}

inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int filter_width, int filter_height,
                        float output_activation_min,
                        float output_activation_max, float* output_data,
                        const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float total = 0.f;
          float filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              total +=
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              filter_count++;
            }
          }
          const float average = total / filter_count;
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(average, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}
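
// A small worked example of the window clamping above (hypothetical
// numbers): with input_width = 5, filter_width = 3, stride_width = 1 and
// pad_width = 1, the output column out_x = 0 has in_x_origin = -1, giving
// filter_x_start = 1 and filter_x_end = 3. Only the in-bounds columns
// {0, 1} contribute, and filter_count counts just those valid elements, so
// border averages divide by the smaller window rather than by
// filter_width * filter_height.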

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int filter_width, int filter_height,
                 float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, float* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_data, output_dims);
}

inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int filter_width, int filter_height,
                        int32 output_activation_min,
                        int32 output_activation_max, uint8* output_data,
                        const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          int32 acc = 0;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              filter_count++;
            }
          }
          acc = (acc + filter_count / 2) / filter_count;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int filter_width, int filter_height,
                 int32 output_activation_min, int32 output_activation_max,
                 uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_activation_min,
                  output_activation_max, output_data, output_dims);
}

inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int filter_width, int filter_height,
                   float output_activation_min, float output_activation_max,
                   float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float sum_squares = 0.f;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              const float val =
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              sum_squares += val * val;
              filter_count++;
            }
          }
          const float l2pool_result = std::sqrt(sum_squares / filter_count);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims,
            int stride_width, int stride_height, int pad_width, int pad_height,
            int filter_width, int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
         pad_height, filter_width, filter_height, output_activation_min,
         output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int filter_width, int filter_height,
            float* output_data, const Dims<4>& output_dims) {
  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
             filter_width, filter_height, output_data, output_dims);
}

inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    float output_activation_min, float output_activation_max,
                    float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float max = std::numeric_limits<float>::lowest();
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(max, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, float* output_data,
             const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             float* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_data, output_dims);
}

inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    int32 output_activation_min, int32 output_activation_max,
                    uint8* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_GE(output_activation_min, 0);
  TFLITE_DCHECK_LE(output_activation_max, 255);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          uint8 max = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          max = std::max<uint8>(max, output_activation_min);
          max = std::min<uint8>(max, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, int32 output_activation_min,
             int32 output_activation_max, uint8* output_data,
             const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             int32 output_activation_min, int32 output_activation_max,
             uint8* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

inline void LocalResponseNormalization(const float* input_data,
                                       const Dims<4>& input_dims, int range,
                                       float bias, float alpha, float beta,
                                       float* output_data,
                                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int begin_input_c = std::max(0, c - range);
          const int end_input_c = std::min(depth, c + range);
          float accum = 0.f;
          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
            const float input_val =
                input_data[Offset(input_dims, input_c, x, y, b)];
            accum += input_val * input_val;
          }
          const float multiplier = std::pow(bias + alpha * accum, -beta);
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
        }
      }
    }
  }
}

inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                    float beta, float* output_data,
                    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        // Find the max element value, which we'll use to ensure numerical
        // stability by taking advantage of the following equality:
        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
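        // For instance (illustrative numbers), inputs {1000.f, 1001.f} would
        // overflow std::exp directly, but with C = -max the terms become
        // {exp(-1), exp(0)}, which are computed safely and give the same
        // softmax result.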
        float max = std::numeric_limits<float>::lowest();
        for (int c = 0; c < depth; ++c) {
          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // Compute sum.
        float sum = 0.f;
        for (int c = 0; c < depth; ++c) {
          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                          beta);
        }

        // Compute result.
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                       beta) /
              sum;
        }
      }
    }
  }
}

inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_beta_multiplier, int32 input_beta_left_shift,
                    int diff_min, uint8* output_data,
                    const Dims<4>& output_dims) {
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large as
  // -32 before multiplying by input_beta_multiplier, and therefore as large as
  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
  // accumulation, but exp(-16) definitely is.
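  // As a rough illustration of the Q5.26 format (1 sign bit, 5 integer bits,
  // 26 fractional bits in an int32): it represents values in approximately
  // [-32, 32), with -1.0 stored as the raw integer -(1 << 26); that is just
  // enough headroom for the scaled diffs described above.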
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int x = 0; x < width; ++x) {
      for (int y = 0; y < height; ++y) {
        uint8 max_in_row = 0;
        for (int c = 0; c < depth; ++c) {
          max_in_row =
              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
        }

        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
            sum_of_exps =
                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                  exp_on_negative_values(scaled_diff_f8));
          }
        }

        int32 fixed_sum_of_exps = sum_of_exps.raw();
        int headroom_plus_one =
            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
        // This is the number of bits to the left of the binary point above 1.0.
        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
        // no later adjustment will be needed.
        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
        int32 shifted_sum_minus_one = static_cast<int32>(
            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
            (static_cast<uint32>(1) << 31));

        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
            FixedPoint0::FromRaw(shifted_sum_minus_one));

        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);

            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));

          } else {
            output_data[Offset(output_dims, c, x, y, b)] = 0;
          }
        }
      }
    }
  }
}

inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = 1.f / (1.f + std::exp(-val));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_zero_point, int32 input_range_radius,
                     int32 input_multiplier, int input_left_shift,
                     uint8* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
          const int32 input_val_centered =
              static_cast<int32>(input_val_u8) - input_zero_point;
          uint8 output_val;
          if (input_val_centered <= -input_range_radius) {
            output_val = 0;
          } else if (input_val_centered >= input_range_radius) {
            output_val = 255;
          } else {
            const int32 input_val_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_val_centered, input_multiplier, input_left_shift);
            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
            const FixedPoint4 input_val_f4 =
                FixedPoint4::FromRaw(input_val_rescaled);
            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
            using gemmlowp::RoundingDivideByPOT;
            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
            if (output_val_s32 == 256) {
              output_val_s32 = 255;
            }
            TFLITE_DCHECK_GE(output_val_s32, 0);
            TFLITE_DCHECK_LE(output_val_s32, 255);
            output_val = static_cast<uint8>(output_val_s32);
          }
          output_data[Offset(output_dims, c, x, y, b)] = output_val;
        }
      }
    }
  }
}
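
// In the quantized Logistic above, gemmlowp::logistic returns a Q0.31 value
// in [0, 1); rounding-dividing by 2^23 (= 2^(31 - 8)) rescales it to the
// 8-bit output range [0, 256], and the == 256 case is saturated to 255.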

inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = std::tanh(val);
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                       int32 zero_point, double scale, float* output_data,
                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int32 val = input_data[Offset(input_dims, c, x, y, b)];
          float result = static_cast<float>(scale * (val - zero_point));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
                      float rmin, float rmax, float* output_data,
                      const Dims<4>& output_dims) {
  // 0 should always be a representable value. Let's assume that the initial
  // min,max range contains 0.
  TFLITE_DCHECK_LE(rmin, 0.);
  TFLITE_DCHECK_GE(rmax, 0.);

  // Determine quantization parameters: zero_point, scale.
  using Integer = uint8;
  const Integer qmin = std::numeric_limits<Integer>::min();
  const Integer qmax = std::numeric_limits<Integer>::max();
  const float qmin_float = qmin;
  const float qmax_float = qmax;
  int32 zero_point = 0;
  float scale = 0.f;
  // If rmin==rmax, both must be zero per the above assertion,
  // so we are done.
  if (rmin != rmax) {
    // First determine the scale.
    scale = (rmax - rmin) / (qmax_float - qmin_float);

    // Zero-point computation.
    // First the initial floating-point computation. The zero-point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair
    // will be roughly machine_epsilon * (sum of absolute values of terms)
    // so we want to use the variant that adds the smaller terms.
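    // As a worked example (hypothetical numbers): for rmin = -1.f and
    // rmax = 1.f, scale = 2 / 255. Then zero_point_from_min == 0 - (-1) /
    // scale == 127.5 and zero_point_from_max == 255 - 1 / scale == 127.5,
    // with error estimates 127.5 and 382.5 respectively, so the min-based
    // variant is chosen and nudged below to the integer 128.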
    const float zero_point_from_min = qmin_float - rmin / scale;
    const float zero_point_from_max = qmax_float - rmax / scale;
    const float zero_point_from_min_error =
        std::abs(qmin_float) + std::abs(rmin / scale);
    const float zero_point_from_max_error =
        std::abs(qmax_float) + std::abs(rmax / scale);

    const float zero_point_float =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer
    // (our zero points are integer, and this is motivated by the requirement
    // to be able to represent the real value "0" exactly as a quantized value,
    // which is required in multiple places, for example in Im2col with SAME
    // padding).
    if (zero_point_float < qmin_float) {
      zero_point = qmin;
    } else if (zero_point_float > qmax_float) {
      zero_point = qmax;
    } else {
      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
    }
    // The zero point should always be within the range of quantized values,
    // [qmin, qmax].
    TFLITE_DCHECK_GE(zero_point, qmin);
    TFLITE_DCHECK_LE(zero_point, qmax);
  }

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
          const float unclamped_quantized_val =
              TfLiteRound(zero_point + src_val / scale);
          const float quantized_val = std::min(
              qmax_float, std::max(qmin_float, unclamped_quantized_val));
          const float dst_val = scale * (quantized_val - zero_point);
          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
        }
      }
    }
  }
}

template <typename SrcT, typename DstT>
inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                 DstT* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
          output_data[offset] = static_cast<DstT>(input_data[offset]);
        }
      }
    }
  }
}

inline void Floor(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
          output_data[offset] = std::floor(input_data[offset]);
        }
      }
    }
  }
}

template <typename T>
inline void Gather(const T* input_data, const Dims<4>& input_dims,
                   int input_rank, const int32* coords_data,
                   const Dims<4>& coords_dims, T* output_data,
                   const Dims<4>& output_dims) {
  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
  int stride = input_dims.strides[input_rank - 1];
  T* out = output_data;

  for (int i = 0; i < coords_dims.sizes[0]; i++) {
    TFLITE_DCHECK_GE(coords_data[i], 0);
    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
    const T* in = input_data + coords_data[i] * stride;
    memcpy(out, in, sizeof(T) * stride);
    out += stride;
  }
}
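
// For example (hypothetical values): with input_rank = 2, an input whose
// innermost dimension has size 4 (so strides[1] == 4) and whose gathered
// dimension has size 8, coords_data = {5, 2} copies the 4-element slices at
// offsets 5 * 4 and 2 * 4, in that order, into output_data.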

inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                           const int32* output_size_data,
                           const Dims<4>& output_size_dims, float* output_data,
                           const Dims<4>& output_dims) {
  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  int32 input_height = ArraySize(input_dims, 2);
  int32 input_width = ArraySize(input_dims, 1);
  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      float input_y = y * height_scale;
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        for (int c = 0; c < depth; ++c) {
          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
                                    (1 - (input_y - y0)) *
                                    (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x0, y1, b)] *
                                    (input_y - y0) * (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x1, y0, b)] *
                                    (1 - (input_y - y0)) * (input_x - x0) +
                                input_data[Offset(input_dims, c, x1, y1, b)] *
                                    (input_y - y0) * (input_x - x0);
          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
        }
      }
    }
  }
}
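
// The interpolation above is the standard bilinear blend: writing
// dy = input_y - y0 and dx = input_x - x0, the output is
//
//   (1 - dy) * (1 - dx) * in(x0, y0) + dy * (1 - dx) * in(x0, y1) +
//   (1 - dy) * dx * in(x1, y0) + dy * dx * in(x1, y1),
//
// i.e. a weighted average of the four nearest input pixels.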

template <typename T>
inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims,
                           const int32* paddings_data,
                           const Dims<4>& paddings_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];
  const int padding_top = paddings_data[0];
  const int padding_left = paddings_data[2];

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
        if (out_h * block_shape_height < padding_top ||
            out_h * block_shape_height >= padding_top + input_height ||
            out_w * block_shape_width < padding_left ||
            out_w * block_shape_width >= padding_left + input_width) {
          memset(out, 0, depth * sizeof(T));
        } else {
          const T* in =
              input_data +
              Offset(input_dims, 0,
                     (out_w * block_shape_width + shift_w) - padding_left,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     input_batch);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}
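
// For example (illustrative shapes): with block_shape = {2, 2}, zero
// padding, and a 4x4 input with batch 1, the output is four 2x2 batches;
// output batch out_b selects the input pixels whose (row, col) offsets
// within each 2x2 block are (shift_h, shift_w) = (out_b / 2, out_b % 2).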
2204
2205template <typename T>
2206inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
2207                           const int32* block_shape_data,
2208                           const Dims<4>& block_shape_dims, T* output_data,
2209                           const Dims<4>& output_dims) {
2210  const int output_batch_size = ArraySize(output_dims, 3);
2211  const int input_batch_size = ArraySize(input_dims, 3);
2212  const int input_height = ArraySize(input_dims, 2);
2213  const int input_width = ArraySize(input_dims, 1);
2214  const int depth = ArraySize(input_dims, 0);
2215  const int block_shape_width = block_shape_data[1];
2216  const int block_shape_height = block_shape_data[0];
2217
2218  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
2219    for (int in_h = 0; in_h < input_height; ++in_h) {
2220      for (int in_w = 0; in_w < input_width; ++in_w) {
2221        int out_batch = in_batch % output_batch_size;
2222        int out_w = in_w * block_shape_width +
2223                    (in_batch / output_batch_size) % block_shape_width;
2224        int out_h = in_h * block_shape_height +
2225                    (in_batch / output_batch_size) / block_shape_width;
2226        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
2227        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
2228        memcpy(out, in, depth * sizeof(T));
2229      }
2230    }
2231  }
2232}
2233
// Pads the input with zeros. left_paddings and right_paddings are indexed in
// the storage order of Dims<4>: {depth, width, height, batch}.
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int left_b_padding = left_paddings[3];
  const int left_h_padding = left_paddings[2];
  const int left_w_padding = left_paddings[1];
  const int left_d_padding = left_paddings[0];

  const int right_b_padding = right_paddings[3];
  const int right_h_padding = right_paddings[2];
  const int right_w_padding = right_paddings[1];
  const int right_d_padding = right_paddings[0];

  // Both tensors are walked linearly, so dense, contiguous layouts are
  // assumed.
  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            *out_ptr++ = 0;
          } else {
            *out_ptr++ = *in_ptr++;
          }
        }
      }
    }
  }
}

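// A minimal usage sketch for Pad (illustrative only): surrounding a single
// pixel with one zero on each side of height and width.
//
//   float input[1] = {5.f};                  // 1x1x1x1
//   float output[9];                         // 1x3x3x1
//   std::vector<int> left = {0, 1, 1, 0};    // {depth, width, height, batch}
//   std::vector<int> right = {0, 1, 1, 0};
//   Dims<4> in_dims = {{1, 1, 1, 1}, {1, 1, 1, 1}};
//   Dims<4> out_dims = {{1, 3, 3, 1}, {1, 1, 3, 9}};
//   Pad(input, in_dims, left, right, output, out_dims);
//   // output is all zeros except output[4] == 5.f (the center).
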
// begin_mask and end_mask are bit fields over the Dims<4> storage order:
// bit 0 = depth, bit 1 = width, bit 2 = height, bit 3 = batch. A set bit in
// begin_mask (end_mask) means the corresponding start (stop) index is
// ignored and the full extent is used instead. Only positive strides are
// supported.
template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  const int start_b = (begin_mask & 8) ? 0 : starts[3];
  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
  const int start_h = (begin_mask & 4) ? 0 : starts[2];
  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
  const int start_w = (begin_mask & 2) ? 0 : starts[1];
  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
  const int start_d = (begin_mask & 1) ? 0 : starts[0];
  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
    for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
      for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
        for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}

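// A minimal usage sketch for StridedSlice (illustrative only): taking every
// other element of a row stored along the width axis.
//
//   float input[4] = {0.f, 1.f, 2.f, 3.f};   // 1x1x4x1 (width = 4)
//   float output[2];
//   std::vector<int> starts = {0, 0, 0, 0};
//   std::vector<int> stops = {1, 4, 1, 1};
//   std::vector<int> strides = {1, 2, 1, 1};
//   Dims<4> in_dims = {{1, 4, 1, 1}, {1, 1, 4, 4}};
//   Dims<4> out_dims = {{1, 2, 1, 1}, {1, 1, 2, 2}};
//   StridedSlice(input, in_dims, /*begin_mask=*/0, /*end_mask=*/0, starts,
//                stops, strides, output, out_dims);
//   // output is {0.f, 2.f}.
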
template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  // TODO(dkalenichenko): This op only supports 4D tensors.
  TFLITE_DCHECK_EQ(begin.size(), 4);
  TFLITE_DCHECK_EQ(size.size(), 4);
  // A size of -1 means "all remaining elements in the dimension".
  const int start_b = begin[3];
  const int stop_b = size[3] == -1 ? input_dims.sizes[3] : start_b + size[3];
  const int start_h = begin[2];
  const int stop_h = size[2] == -1 ? input_dims.sizes[2] : start_h + size[2];
  const int start_w = begin[1];
  const int stop_w = size[1] == -1 ? input_dims.sizes[1] : start_w + size[1];
  const int start_d = begin[0];
  const int stop_d = size[0] == -1 ? input_dims.sizes[0] : start_d + size[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        for (int in_d = start_d; in_d < stop_d; ++in_d) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}

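// A minimal usage sketch for Slice (illustrative only): taking the tail of a
// row stored along the width axis, using size = -1.
//
//   float input[4] = {0.f, 1.f, 2.f, 3.f};   // 1x1x4x1 (width = 4)
//   float output[3];
//   std::vector<int> begin = {0, 1, 0, 0};   // start at width index 1
//   std::vector<int> size = {1, -1, 1, 1};   // -1: to the end of the width
//   Dims<4> in_dims = {{1, 4, 1, 1}, {1, 1, 4, 4}};
//   Dims<4> out_dims = {{1, 3, 1, 1}, {1, 1, 3, 3}};
//   Slice(input, in_dims, begin, size, output, out_dims);
//   // output is {1.f, 2.f, 3.f}.
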
template <typename T>
inline void Mean(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& reduction_indices, T* output_data,
                 const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);

  // The current implementation only supports simultaneous reduction over
  // width and height.
  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
  TFLITE_DCHECK_EQ(output_height, 1);
  TFLITE_DCHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      // The sum is accumulated in float regardless of T.
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
        }
      }
      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
          value / (input_width * input_height);
    }
  }
}

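// A minimal usage sketch for Mean (illustrative only): averaging a 1x2x2x1
// image over its height and width.
//
//   float input[4] = {1.f, 2.f, 3.f, 4.f};        // 1x2x2x1
//   float output[1];
//   std::vector<int> reduction_indices = {1, 2};  // width and height axes
//   Dims<4> in_dims = {{1, 2, 2, 1}, {1, 1, 2, 4}};
//   Dims<4> out_dims = {{1, 1, 1, 1}, {1, 1, 1, 1}};
//   Mean(input, in_dims, reduction_indices, output, out_dims);
//   // output[0] == 2.5f.
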
template <typename T>
void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
         const Dims<4>& input2_dims, T* output_data,
         const Dims<4>& output_dims) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
        }
      }
    }
  }
}

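// A minimal usage sketch for the broadcasting Sub (illustrative only, and
// assuming NdArrayDescsForElementwiseBroadcast gives broadcast dimensions a
// stride of 0): removing a per-channel offset from a two-pixel, two-channel
// tensor.
//
//   float input1[4] = {10.f, 20.f, 30.f, 40.f};  // 1x1x2x2, channel-minor
//   float offset[2] = {1.f, 2.f};                // broadcast along width
//   float output[4];
//   Dims<4> in1_dims = {{2, 2, 1, 1}, {1, 2, 4, 4}};
//   Dims<4> in2_dims = {{2, 1, 1, 1}, {1, 2, 2, 2}};
//   Dims<4> out_dims = in1_dims;
//   Sub(input1, in1_dims, offset, in2_dims, output, out_dims);
//   // output is {9.f, 18.f, 29.f, 38.f}.
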
// Element-wise minimum against a scalar: input2 supplies a single value, and
// the output is min(input1[i], input2[0]) at each position.
template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  auto min_value = input2_data[0];

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < input_height; ++y) {
      for (int x = 0; x < input_width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] > min_value ? min_value : input1_data[offset];
        }
      }
    }
  }
}

// Element-wise maximum against a scalar: input2 supplies a single value, and
// the output is max(input1[i], input2[0]) at each position.
template <typename T>
void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);

  auto max_value = input2_data[0];

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < input_height; ++y) {
      for (int x = 0; x < input_width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input1_dims, c, x, y, b);
          output_data[offset] =
              input1_data[offset] < max_value ? max_value : input1_data[offset];
        }
      }
    }
  }
}

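// A minimal usage sketch for TensorFlowMaximum / TensorFlowMinimum
// (illustrative only): clamping a vector to [0, 6], as in a ReLU6.
//
//   float input[4] = {-1.f, 3.f, 7.f, 5.f};   // 1x1x1x4 (depth = 4)
//   float zero = 0.f, six = 6.f;
//   float tmp[4], output[4];
//   Dims<4> dims = {{4, 1, 1, 1}, {1, 4, 4, 4}};
//   TensorFlowMaximum(input, dims, &zero, tmp, dims);  // clamp below at 0
//   TensorFlowMinimum(tmp, dims, &six, output, dims);  // clamp above at 6
//   // output is {0.f, 3.f, 6.f, 5.f}.
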
template <typename T1, typename T2, typename T3>
void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
            T2* output_data, const Dims<4>& output_dims) {
  // The current ArgMax implementation can only determine the index of the
  // maximum value in the last dimension, so the axis argument is checked
  // against that and not otherwise used.
  TFLITE_DCHECK_EQ(axis[0], 3);

  // Conceptually ArgMax produces one fewer dimension than its input. For
  // simplicity, the output here keeps all four dimensions, with the reduced
  // (last) dimension constrained to extent 1.
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
        int max_index = 0;
        // On ties, the lowest index wins.
        for (int d = 1; d < depth; ++d) {
          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
          if (curr_value > max_value) {
            max_value = curr_value;
            max_index = d;
          }
        }
        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
      }
    }
  }
}

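// A minimal usage sketch for ArgMax (illustrative only): picking the winning
// class along the last (depth) dimension.
//
//   float logits[3] = {0.1f, 2.5f, 0.4f};   // 1x1x1x3
//   int32 index[1];
//   int32 axis[1] = {3};                    // last dimension, as required
//   Dims<4> in_dims = {{3, 1, 1, 1}, {1, 3, 3, 3}};
//   Dims<4> out_dims = {{1, 1, 1, 1}, {1, 1, 1, 1}};
//   ArgMax(axis, logits, in_dims, index, out_dims);
//   // index[0] == 1.
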
}  // namespace reference_ops
}  // namespace tflite

#endif  // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_