reference_ops.h revision 6dfe00ca4114371ed47c93810219736e3deda2d2
1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 16#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 17 18#include <stdint.h> 19#include <sys/types.h> 20#include <algorithm> 21#include <cmath> 22#include <limits> 23#include <memory> 24#include <type_traits> 25 26#include "third_party/eigen3/Eigen/Core" 27#include "fixedpoint/fixedpoint.h" 28#include "public/gemmlowp.h" 29#include "tensorflow/contrib/lite/kernels/internal/common.h" 30#include "tensorflow/contrib/lite/kernels/internal/round.h" 31#include "tensorflow/contrib/lite/kernels/internal/types.h" 32 33namespace tflite { 34namespace reference_ops { 35 36inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( 37 int32 x, int32 quantized_multiplier, int right_shift) { 38 using gemmlowp::RoundingDivideByPOT; 39 using gemmlowp::SaturatingRoundingDoublingHighMul; 40 return RoundingDivideByPOT( 41 SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); 42} 43 44inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( 45 int32 x, int32 quantized_multiplier, int left_shift) { 46 using gemmlowp::SaturatingRoundingDoublingHighMul; 47 return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), 48 quantized_multiplier); 49} 50 51template <typename 
T> 52int CountLeadingZeros(T integer_input) { 53 static_assert(std::is_unsigned<T>::value, 54 "Only unsigned integer types handled."); 55 const T one_in_leading_positive = static_cast<T>(1) 56 << (std::numeric_limits<T>::digits - 1); 57 int leading_zeros = 0; 58 while (integer_input < one_in_leading_positive) { 59 integer_input <<= 1; 60 ++leading_zeros; 61 } 62 return leading_zeros; 63} 64 65// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE 66// BROADCASTING. 67// 68// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional 69// rectangular array of numbers. 70// 71// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. 72// However, as Dims<N> is to be deprecated, this class exists as an adaptor 73// to enable simple unoptimized implementations of element-wise broadcasting 74// operations. 75template <int N> 76struct NdArrayDesc { 77 // The "extent" of each dimension. Indices along dimension d must be in the 78 // half-open interval [0, extents[d]). 79 int extents[N]; 80 81 // The number of *elements* (not bytes) between consecutive indices of each 82 // dimension. 83 int strides[N]; 84}; 85 86// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING 87// ELEMENT-WISE BROADCASTING. 88// 89// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. 90inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, 91 int i3) { 92 TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); 93 TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); 94 TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); 95 TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); 96 return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + 97 i3 * desc.strides[3]; 98} 99 100// Given the dimensions of the operands for an element-wise binary broadcast, 101// adjusts them so that they can be directly iterated over with simple loops. 
102// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and 103// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. 104// 105// This function assumes that the two input shapes are compatible up to 106// broadcasting and the shorter one has already been prepended with 1s to be the 107// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), 108// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that 109// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be 110// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). 111// 112// When two shapes are compatible up to broadcasting, for each dimension d, 113// the input extents are either equal, or one of them is 1. 114// 115// This function performs the following for each dimension d: 116// - If the extents are equal, then do nothing since the loop that walks over 117// both of the input arrays is correct. 118// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 119// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows 120// array0 to be referenced *at any index* in dimension d and still access the 121// same slice. 122template <int N> 123inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, 124 const Dims<N>& input1_dims, 125 NdArrayDesc<N>* desc0_out, 126 NdArrayDesc<N>* desc1_out) { 127 TFLITE_DCHECK(desc0_out != nullptr); 128 TFLITE_DCHECK(desc1_out != nullptr); 129 130 // Copy dims to desc. 131 for (int i = 0; i < N; ++i) { 132 desc0_out->extents[i] = input0_dims.sizes[i]; 133 desc0_out->strides[i] = input0_dims.strides[i]; 134 desc1_out->extents[i] = input1_dims.sizes[i]; 135 desc1_out->strides[i] = input1_dims.strides[i]; 136 } 137 138 // Walk over each dimension. If the extents are equal do nothing. 139 // Otherwise, set the desc with extent 1 to have extent equal to the other and 140 // stride 0. 
141 for (int i = 0; i < N; ++i) { 142 const int extent0 = ArraySize(input0_dims, i); 143 const int extent1 = ArraySize(input1_dims, i); 144 if (extent0 != extent1) { 145 if (extent0 == 1) { 146 desc0_out->strides[i] = 0; 147 desc0_out->extents[i] = extent1; 148 } else { 149 TFLITE_DCHECK_EQ(extent1, 1); 150 desc1_out->strides[i] = 0; 151 desc1_out->extents[i] = extent0; 152 } 153 } 154 } 155} 156 157inline void Conv(const float* input_data, const Dims<4>& input_dims, 158 const float* filter_data, const Dims<4>& filter_dims, 159 const float* bias_data, const Dims<4>& bias_dims, 160 int stride_width, int stride_height, int pad_width, 161 int pad_height, float output_activation_min, 162 float output_activation_max, float* output_data, 163 const Dims<4>& output_dims, float* im2col_data, 164 const Dims<4>& im2col_dims) { 165 (void)im2col_data; // only used in optimized code. 166 (void)im2col_dims; // only used in optimized code. 167 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 168 const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); 169 const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); 170 if (bias_data) { 171 TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0)); 172 } 173 const int input_height = ArraySize(input_dims, 2); 174 const int input_width = ArraySize(input_dims, 1); 175 const int filter_height = ArraySize(filter_dims, 2); 176 const int filter_width = ArraySize(filter_dims, 1); 177 const int output_height = ArraySize(output_dims, 2); 178 const int output_width = ArraySize(output_dims, 1); 179 for (int batch = 0; batch < batches; ++batch) { 180 for (int out_y = 0; out_y < output_height; ++out_y) { 181 for (int out_x = 0; out_x < output_width; ++out_x) { 182 for (int out_channel = 0; out_channel < output_depth; ++out_channel) { 183 const int in_x_origin = (out_x * stride_width) - pad_width; 184 const int in_y_origin = (out_y * stride_height) - pad_height; 185 float total = 
0.f; 186 for (int filter_y = 0; filter_y < filter_height; ++filter_y) { 187 for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 188 for (int in_channel = 0; in_channel < input_depth; ++in_channel) { 189 const int in_x = in_x_origin + filter_x; 190 const int in_y = in_y_origin + filter_y; 191 // If the location is outside the bounds of the input image, 192 // use zero as a default value. 193 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && 194 (in_y < input_height)) { 195 float input_value = input_data[Offset(input_dims, in_channel, 196 in_x, in_y, batch)]; 197 float filter_value = 198 filter_data[Offset(filter_dims, in_channel, filter_x, 199 filter_y, out_channel)]; 200 total += (input_value * filter_value); 201 } 202 } 203 } 204 } 205 float bias_value = 0.0f; 206 if (bias_data) { 207 bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; 208 } 209 output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = 210 ActivationFunctionWithMinMax(total + bias_value, 211 output_activation_min, 212 output_activation_max); 213 } 214 } 215 } 216 } 217} 218 219// legacy, for compatibility with old checked-in code 220template <FusedActivationFunctionType Ac> 221void Conv(const float* input_data, const Dims<4>& input_dims, 222 const float* filter_data, const Dims<4>& filter_dims, 223 const float* bias_data, const Dims<4>& bias_dims, int stride_width, 224 int stride_height, int pad_width, int pad_height, float* output_data, 225 const Dims<4>& output_dims, float* im2col_data, 226 const Dims<4>& im2col_dims) { 227 float output_activation_min, output_activation_max; 228 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 229 Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, 230 stride_width, stride_height, pad_width, pad_height, 231 output_activation_min, output_activation_max, output_data, output_dims, 232 im2col_data, im2col_dims); 233} 234 235// legacy, for compatibility with old checked-in code 
// Same as the legacy float Conv wrapper above, but takes a single `stride`
// used for both width and height.
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}

// Quantized (uint8) reference convolution. Zero-point offsets are added to
// raw input/filter values, products are accumulated in int32, the bias (if
// present) is added, then the accumulator is rescaled by
// (output_multiplier, output_shift), offset by output_offset and clamped to
// [output_activation_min, output_activation_max].
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  (void)im2col_data;   // only used in optimized code.
  (void)im2col_dims;   // only used in optimized code.
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
  const int output_depth =
      MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const int filter_width = ArraySize(filter_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          int32 acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
                const int in_x = in_x_origin + filter_x;
                const int in_y = in_y_origin + filter_y;
                // If the location is outside the bounds of the input image,
                // use zero as a default value.
                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                    (in_y < input_height)) {
                  int32 input_val = input_data[Offset(input_dims, in_channel,
                                                      in_x, in_y, batch)];
                  int32 filter_val =
                      filter_data[Offset(filter_dims, in_channel, filter_x,
                                         filter_y, out_channel)];
                  acc +=
                      (filter_val + filter_offset) * (input_val + input_offset);
                }
              }
            }
          }
          if (bias_data) {
            acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
          }
          // Requantize the int32 accumulator back to the uint8 output scale.
          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
              acc, output_multiplier, output_shift);
          acc += output_offset;
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
              static_cast<uint8>(acc);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Template-parameter activation variant; for kNone the clamp range must be
// the full uint8 range [0, 255].
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
// Single-stride variant of the legacy quantized Conv wrapper above.
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims,
           filter_offset, bias_data, bias_dims, stride, stride, pad_width,
           pad_height, output_offset, output_multiplier, output_shift,
           output_activation_min, output_activation_max, output_data,
           output_dims, im2col_data, im2col_dims, gemm_context);
}

// Moves values from the depth dimension into block_size x block_size spatial
// blocks: per the DCHECKs, input_depth == output_depth * block_size^2 and the
// spatial extents grow by block_size. Pure data movement, no arithmetic on
// the element values.
template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width * block_size, output_width);
  TFLITE_DCHECK_EQ(input_height * block_size, output_height);
  TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          // The position inside the block (out_h % bs, out_w % bs) selects
          // which slice of the input depth this output element comes from.
          const int in_d =
              out_d + ((out_h % block_size) * block_size + out_w % block_size) *
                          output_depth;
          const int in_w = out_w / block_size;
          const int in_h = out_h / block_size;
          const int in_b = out_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

// Inverse of DepthToSpace: folds block_size x block_size spatial blocks into
// the depth dimension (output_depth == input_depth * block_size^2 per the
// DCHECKs). Pure data movement.
template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int input_batch = ArraySize(input_dims, 3);

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_batch = ArraySize(output_dims, 3);

  TFLITE_DCHECK_EQ(input_width, output_width * block_size);
  TFLITE_DCHECK_EQ(input_height, output_height * block_size);
  TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth);
  TFLITE_DCHECK_EQ(input_batch, output_batch);

  for (int in_b = 0; in_b < input_batch; ++in_b) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        for (int in_d = 0; in_d < input_depth; ++in_d) {
          // The position inside the block (in_h % bs, in_w % bs) selects the
          // destination slice of the output depth.
          const int out_d =
              in_d + ((in_h % block_size) * block_size + in_w % block_size) *
                         input_depth;
          const int out_w = in_w / block_size;
          const int out_h = in_h / block_size;
          const int out_b = in_b;

          const int output_index =
              Offset(output_dims, out_d, out_w, out_h, out_b);
          const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b);

          output_data[output_index] = input_data[input_index];
        }
      }
    }
  }
}

// Reference float fully-connected layer: for each batch, output[out_c] is the
// dot product of the input row with weight row out_c, plus bias, clamped to
// [output_activation_min, output_activation_max]. Inputs and weights must be
// densely packed (no strides), as DCHECKed below.
inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(weights_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives the activation clamp range from the template parameter and forwards
// to the runtime-min/max FullyConnected above.
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}

// Quantized (uint8) reference fully-connected layer. Accumulates
// (input + input_offset) * (filter + filter_offset) in int32 per output
// channel, adds the optional bias, requantizes via
// (output_multiplier, output_shift), offsets and clamps the result.
inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  (void)gemm_context;  // only used in optimized code.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
  const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0);
  const int accum_depth = ArraySize(filter_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
      }
      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
                                                        output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}

// legacy, for compatibility with old checked-in code
// Template-parameter activation variant; for kNone the clamp range must be
// the full uint8 range [0, 255].
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims,
                 filter_offset, bias_data, bias_dims, output_offset,
                 output_multiplier, output_shift, output_activation_min,
                 output_activation_max, output_data, output_dims, gemm_context);
}

// Batch normalization with per-position statistics: mean/multiplier/offset
// are indexed at (c, x, y, 0), i.e. they vary over channel AND spatial
// position. Output = (input - mean) * multiplier + offset, then activation.
template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

// Batch normalization with per-channel statistics: mean/multiplier/offset are
// indexed at (c, 0, 0, 0), i.e. they vary only over channel.
template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}

// Element-wise max(x, 0).
inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float lower = 0;
          float clamped = val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

// Element-wise clamp to [-1, 1].
inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

// Element-wise clamp to [0, 6].
inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

// L2-normalizes each depth vector (per batch/spatial position): divides every
// element by the L2 norm computed over the depth dimension. No fused
// activation is supported (static_assert below).
// NOTE(review): a zero vector yields a division by zero here — presumably
// callers guarantee non-zero input; confirm before relying on it.
template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float l2_norm = std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
        }
      }
    }
  }
}

// Computes a fixed-point multiplier/shift pair approximating 1/sqrt(input),
// using a Newton-Raphson iteration (x <- 1.5*x - 0.5*input*x^3) in gemmlowp
// fixed-point arithmetic. The input is first range-reduced into [2^27, 2^29)
// with the exponent folded into *output_shift. Input must be > 0 (DCHECKed).
inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}

// Quantized (uint8) L2 normalization over the depth dimension only: the
// DCHECKs below require batches == height == width == 1. The rescaled result
// is centered at 128 and clamped to [0, 255].
inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point;
    // The factor 128 maps the normalized value (in [-1, 1]) onto the uint8
    // output scale before re-centering at 128 below.
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[Offset(output_dims, i, 0, 0, 0)] =
        static_cast<uint8>(output_val);
  }
}

// Element-wise float addition of two same-shape arrays, with the sum clamped
// to [output_activation_min, output_activation_max].
inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] +
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives the activation clamp range from the template parameter and forwards
// to the runtime-min/max Add above.
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
      output_activation_max, output_data, output_dims);
}

// Quantized (uint8) element-wise addition. Each input is offset by its zero
// point, shifted left by left_shift for headroom, and rescaled by its own
// (multiplier, shift) pair into a common scale; the sum is then requantized
// via (output_multiplier, output_shift), offset and clamped.
template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const int32 input1_val =
              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
template <FusedActivationFunctionType Ac>
void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims,
                  const float* input2_data, const Dims<4>& input2_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
913 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 914 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 915 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 916 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 917 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 918 input1_data[SubscriptToIndex(desc1, c, x, y, b)] + 919 input2_data[SubscriptToIndex(desc2, c, x, y, b)]); 920 } 921 } 922 } 923 } 924} 925 926inline void BroadcastAdd(int left_shift, const uint8* input1_data, 927 const Dims<4>& input1_dims, int32 input1_offset, 928 int32 input1_multiplier, int input1_shift, 929 const uint8* input2_data, const Dims<4>& input2_dims, 930 int32 input2_offset, int32 input2_multiplier, 931 int input2_shift, int32 output_offset, 932 int32 output_multiplier, int output_shift, 933 int32 output_activation_min, 934 int32 output_activation_max, uint8* output_data, 935 const Dims<4>& output_dims) { 936 gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); 937 938 NdArrayDesc<4> desc1; 939 NdArrayDesc<4> desc2; 940 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 941 942 // In Tensorflow, the dimensions are canonically named (batch_number, row, 943 // col, channel), with extents (batches, height, width, depth), with the 944 // trailing dimension changing most rapidly (channels has the smallest stride, 945 // typically 1 element). 946 // 947 // In generated C code, we store arrays with the dimensions reversed. The 948 // first dimension has smallest stride. 949 // 950 // We name our variables by their Tensorflow convention, but generate C code 951 // nesting loops such that the innermost loop has the smallest stride for the 952 // best cache behavior. 
953 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 954 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 955 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 956 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 957 const int32 input1_val = 958 input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; 959 const int32 input2_val = 960 input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 961 const int32 shifted_input1_val = input1_val * (1 << left_shift); 962 const int32 shifted_input2_val = input2_val * (1 << left_shift); 963 const int32 scaled_input1_val = 964 MultiplyByQuantizedMultiplierSmallerThanOne( 965 shifted_input1_val, input1_multiplier, input1_shift); 966 const int32 scaled_input2_val = 967 MultiplyByQuantizedMultiplierSmallerThanOne( 968 shifted_input2_val, input2_multiplier, input2_shift); 969 const int32 raw_sum = scaled_input1_val + scaled_input2_val; 970 const int32 raw_output = 971 MultiplyByQuantizedMultiplierSmallerThanOne( 972 raw_sum, output_multiplier, output_shift) + 973 output_offset; 974 const int32 clamped_output = 975 std::min(output_activation_max, 976 std::max(output_activation_min, raw_output)); 977 output_data[Offset(output_dims, c, x, y, b)] = 978 static_cast<uint8>(clamped_output); 979 } 980 } 981 } 982 } 983} 984 985template <FusedActivationFunctionType Ac> 986inline void BroadcastAdd(int left_shift, const uint8* input1_data, 987 const Dims<4>& input1_dims, int32 input1_offset, 988 int32 input1_multiplier, int input1_shift, 989 const uint8* input2_data, const Dims<4>& input2_dims, 990 int32 input2_offset, int32 input2_multiplier, 991 int input2_shift, int32 output_offset, 992 int32 output_multiplier, int output_shift, 993 int32 output_activation_min, 994 int32 output_activation_max, uint8* output_data, 995 const Dims<4>& output_dims) { 996 static_assert(Ac == FusedActivationFunctionType::kNone || 997 Ac == FusedActivationFunctionType::kRelu || 998 Ac == FusedActivationFunctionType::kRelu6 || 
999 Ac == FusedActivationFunctionType::kRelu1, 1000 ""); 1001 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1002 if (Ac == FusedActivationFunctionType::kNone) { 1003 TFLITE_DCHECK_EQ(output_activation_min, 0); 1004 TFLITE_DCHECK_EQ(output_activation_max, 255); 1005 } 1006 BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset, 1007 input1_multiplier, input1_shift, input2_data, input2_dims, 1008 input2_offset, input2_multiplier, input2_shift, output_offset, 1009 output_multiplier, output_shift, output_activation_min, 1010 output_activation_max, output_data, output_dims); 1011} 1012 1013inline void Mul(const float* input1_data, const Dims<4>& input1_dims, 1014 const float* input2_data, const Dims<4>& input2_dims, 1015 float output_activation_min, float output_activation_max, 1016 float* output_data, const Dims<4>& output_dims) { 1017 const int batches = 1018 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 1019 const int height = 1020 MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 1021 const int width = 1022 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 1023 const int depth = 1024 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 1025 for (int b = 0; b < batches; ++b) { 1026 for (int y = 0; y < height; ++y) { 1027 for (int x = 0; x < width; ++x) { 1028 for (int c = 0; c < depth; ++c) { 1029 output_data[Offset(output_dims, c, x, y, b)] = 1030 ActivationFunctionWithMinMax( 1031 input1_data[Offset(input1_dims, c, x, y, b)] * 1032 input2_data[Offset(input2_dims, c, x, y, b)], 1033 output_activation_min, output_activation_max); 1034 } 1035 } 1036 } 1037 } 1038} 1039 1040// legacy, for compatibility with old checked-in code 1041template <FusedActivationFunctionType Ac> 1042void Mul(const float* input1_data, const Dims<4>& input1_dims, 1043 const float* input2_data, const Dims<4>& input2_dims, 1044 float* output_data, const Dims<4>& output_dims) { 1045 float 
output_activation_min, output_activation_max; 1046 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1047 1048 Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, 1049 output_activation_max, output_data, output_dims); 1050} 1051 1052// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary 1053// dimensionality if the runtime code does a single loop over one dimension 1054// that handles broadcasting as the base case. The code generator would then 1055// generate max(D1, D2) nested for loops. 1056template <FusedActivationFunctionType Ac> 1057void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, 1058 const float* input2_data, const Dims<4>& input2_dims, 1059 float* output_data, const Dims<4>& output_dims) { 1060 gemmlowp::ScopedProfilingLabel label("BroadcastMul"); 1061 1062 NdArrayDesc<4> desc1; 1063 NdArrayDesc<4> desc2; 1064 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 1065 1066 // In Tensorflow, the dimensions are canonically named (batch_number, row, 1067 // col, channel), with extents (batches, height, width, depth), with the 1068 // trailing dimension changing most rapidly (channels has the smallest 1069 // stride, typically 1 element). 1070 // 1071 // In generated C code, we store arrays with the dimensions reversed. The 1072 // first dimension has smallest stride. 1073 // 1074 // We name our variables by their Tensorflow convention, but generate C code 1075 // nesting loops such that the innermost loop has the smallest stride for 1076 // the best cache behavior. 
1077 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 1078 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 1079 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 1080 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 1081 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 1082 input1_data[SubscriptToIndex(desc1, c, x, y, b)] * 1083 input2_data[SubscriptToIndex(desc2, c, x, y, b)]); 1084 } 1085 } 1086 } 1087 } 1088} 1089 1090inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, 1091 int32 input1_offset, const uint8* input2_data, 1092 const Dims<4>& input2_dims, int32 input2_offset, 1093 int32 output_offset, int32 output_multiplier, 1094 int output_shift, int32 output_activation_min, 1095 int32 output_activation_max, uint8* output_data, 1096 const Dims<4>& output_dims) { 1097 gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); 1098 1099 NdArrayDesc<4> desc1; 1100 NdArrayDesc<4> desc2; 1101 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 1102 1103 // In Tensorflow, the dimensions are canonically named (batch_number, row, 1104 // col, channel), with extents (batches, height, width, depth), with the 1105 // trailing dimension changing most rapidly (channels has the smallest 1106 // stride, typically 1 element). 1107 // 1108 // In generated C code, we store arrays with the dimensions reversed. The 1109 // first dimension has smallest stride. 1110 // 1111 // We name our variables by their Tensorflow convention, but generate C code 1112 // nesting loops such that the innermost loop has the smallest stride for 1113 // the best cache behavior. 
1114 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 1115 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 1116 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 1117 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 1118 const int32 input1_val = 1119 input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; 1120 const int32 input2_val = 1121 input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 1122 const int32 unclamped_result = 1123 output_offset + 1124 MultiplyByQuantizedMultiplierSmallerThanOne( 1125 input1_val * input2_val, output_multiplier, output_shift); 1126 const int32 clamped_output = 1127 std::min(output_activation_max, 1128 std::max(output_activation_min, unclamped_result)); 1129 output_data[Offset(output_dims, c, x, y, b)] = 1130 static_cast<uint8>(clamped_output); 1131 } 1132 } 1133 } 1134 } 1135} 1136 1137// legacy, for compatibility with old checked-in code 1138template <FusedActivationFunctionType Ac> 1139inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, 1140 int32 input1_offset, const uint8* input2_data, 1141 const Dims<4>& input2_dims, int32 input2_offset, 1142 int32 output_offset, int32 output_multiplier, 1143 int output_shift, int32 output_activation_min, 1144 int32 output_activation_max, uint8* output_data, 1145 const Dims<4>& output_dims) { 1146 BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, 1147 input2_dims, input2_offset, output_offset, output_multiplier, 1148 output_shift, output_activation_min, output_activation_max, 1149 output_data, output_dims); 1150} 1151 1152inline void Div(const float* input1_data, const Dims<4>& input1_dims, 1153 const float* input2_data, const Dims<4>& input2_dims, 1154 float output_activation_min, float output_activation_max, 1155 float* output_data, const Dims<4>& output_dims) { 1156 const int batches = 1157 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 1158 const int height = 1159 
MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 1160 const int width = 1161 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 1162 const int depth = 1163 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 1164 for (int b = 0; b < batches; ++b) { 1165 for (int y = 0; y < height; ++y) { 1166 for (int x = 0; x < width; ++x) { 1167 for (int c = 0; c < depth; ++c) { 1168 output_data[Offset(output_dims, c, x, y, b)] = 1169 ActivationFunctionWithMinMax( 1170 input1_data[Offset(input1_dims, c, x, y, b)] / 1171 input2_data[Offset(input2_dims, c, x, y, b)], 1172 output_activation_min, output_activation_max); 1173 } 1174 } 1175 } 1176 } 1177} 1178 1179inline void Sub(const float* input1_data, const Dims<4>& input1_dims, 1180 const float* input2_data, const Dims<4>& input2_dims, 1181 float output_activation_min, float output_activation_max, 1182 float* output_data, const Dims<4>& output_dims) { 1183 const int batches = 1184 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 1185 const int height = 1186 MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 1187 const int width = 1188 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 1189 const int depth = 1190 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 1191 for (int b = 0; b < batches; ++b) { 1192 for (int y = 0; y < height; ++y) { 1193 for (int x = 0; x < width; ++x) { 1194 for (int c = 0; c < depth; ++c) { 1195 output_data[Offset(output_dims, c, x, y, b)] = 1196 ActivationFunctionWithMinMax( 1197 input1_data[Offset(input1_dims, c, x, y, b)] - 1198 input2_data[Offset(input2_dims, c, x, y, b)], 1199 output_activation_min, output_activation_max); 1200 } 1201 } 1202 } 1203 } 1204} 1205 1206template <FusedActivationFunctionType Ac, typename Scalar> 1207void Concatenation(int concat_dim, const Scalar* const* input_data, 1208 const Dims<4>* const* input_dims, int inputs_count, 1209 Scalar* output_data, const 
Dims<4>& output_dims) { 1210 TFLITE_DCHECK_GT(inputs_count, 1); 1211 int concat_size = 0; 1212 for (int i = 0; i < inputs_count; i++) { 1213 for (int j = 0; j < 4; j++) { 1214 if (j != concat_dim) { 1215 MatchingArraySize(*input_dims[i], j, output_dims, j); 1216 } 1217 } 1218 concat_size += ArraySize(*input_dims[i], concat_dim); 1219 } 1220 TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); 1221 TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); 1222 int outer_size = 1; 1223 for (int i = concat_dim + 1; i < 4; i++) { 1224 outer_size *= output_dims.sizes[i]; 1225 } 1226 Scalar* output_ptr = output_data; 1227 for (int k = 0; k < outer_size; k++) { 1228 for (int i = 0; i < inputs_count; ++i) { 1229 const int copy_size = 1230 input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; 1231 memcpy(output_ptr, input_data[i] + k * copy_size, 1232 copy_size * sizeof(Scalar)); 1233 output_ptr += copy_size; 1234 } 1235 } 1236} 1237 1238template <FusedActivationFunctionType Ac, typename Scalar> 1239void DepthConcatenation(const Scalar* const* input_data, 1240 const Dims<4>* const* input_dims, int inputs_count, 1241 Scalar* output_data, const Dims<4>& output_dims) { 1242 Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, 1243 output_data, output_dims); 1244} 1245 1246inline void LstmCell(const float* input_data, const Dims<4>& input_dims, 1247 const float* prev_activ_data, 1248 const Dims<4>& prev_activ_dims, const float* weights_data, 1249 const Dims<4>& weights_dims, const float* bias_data, 1250 const Dims<4>& bias_dims, const float* prev_state_data, 1251 const Dims<4>& prev_state_dims, float* output_state_data, 1252 const Dims<4>& output_state_dims, float* output_activ_data, 1253 const Dims<4>& output_activ_dims, float* concat_temp_data, 1254 const Dims<4>& concat_temp_dims, float* activ_temp_data, 1255 const Dims<4>& activ_temp_dims) { 1256 const int batches = 1257 MatchingArraySize(input_dims, 3, prev_activ_dims, 3, 
prev_state_dims, 3, 1258 output_state_dims, 3, output_activ_dims, 3); 1259 const int height = 1260 MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, 1261 output_state_dims, 2, output_activ_dims, 2); 1262 const int width = 1263 MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, 1264 output_state_dims, 1, output_activ_dims, 1); 1265 TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); 1266 TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); 1267 const int input_depth = ArraySize(input_dims, 0); 1268 const int prev_activ_depth = ArraySize(prev_activ_dims, 0); 1269 const int total_input_depth = prev_activ_depth + input_depth; 1270 TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); 1271 TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1272 1); 1273 const int intern_activ_depth = 1274 MatchingArraySize(weights_dims, 1, bias_dims, 0); 1275 TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); 1276 const int output_depth = 1277 MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, 1278 output_state_dims, 0, output_activ_dims, 0); 1279 TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); 1280 1281 // Concatenate prev_activ and input data together 1282 std::vector<float const*> concat_input_arrays_data; 1283 std::vector<Dims<4> const*> concat_input_arrays_dims; 1284 concat_input_arrays_data.push_back(input_data); 1285 concat_input_arrays_data.push_back(prev_activ_data); 1286 concat_input_arrays_dims.push_back(&input_dims); 1287 concat_input_arrays_dims.push_back(&prev_activ_dims); 1288 Concatenation<FusedActivationFunctionType::kNone, float>( 1289 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), 1290 concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); 1291 1292 // Fully connected 1293 FullyConnected<FusedActivationFunctionType::kNone>( 1294 concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, 1295 bias_dims, activ_temp_data, activ_temp_dims); 
1296 1297 // Memory state update (the LSTM "guts") 1298 for (int b = 0; b < batches; ++b) { 1299 for (int w = 0; w < width; ++w) { 1300 for (int h = 0; h < height; ++h) { 1301 for (int c = 0; c < output_depth; ++c) { 1302 const float input_gate = 1303 1.f / 1304 (1.f + std::exp(-activ_temp_data[Offset( 1305 activ_temp_dims, 0 * output_depth + c, w, h, b)])); 1306 const float new_input = std::tanh(activ_temp_data[Offset( 1307 activ_temp_dims, 1 * output_depth + c, w, h, b)]); 1308 const float forget_gate = 1309 1.f / 1310 (1.f + std::exp(-activ_temp_data[Offset( 1311 activ_temp_dims, 2 * output_depth + c, w, h, b)])); 1312 const float output_gate = 1313 1.f / 1314 (1.f + std::exp(-activ_temp_data[Offset( 1315 activ_temp_dims, 3 * output_depth + c, w, h, b)])); 1316 const float new_state = 1317 input_gate * new_input + 1318 forget_gate * 1319 prev_state_data[Offset(prev_state_dims, c, w, h, b)]; 1320 output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state; 1321 output_activ_data[Offset(output_activ_dims, c, w, h, b)] = 1322 output_gate * std::tanh(new_state); 1323 } 1324 } 1325 } 1326 } 1327} 1328 1329template <FusedActivationFunctionType Ac, typename Scalar> 1330void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, 1331 int outputs_count, Scalar* const* output_data, 1332 const Dims<4>* const* output_dims) { 1333 TFLITE_DCHECK_GE(outputs_count, 1); 1334 for (int i = 0; i < outputs_count; i++) { 1335 /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); 1336 /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); 1337 /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); 1338 } 1339 const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3); 1340 const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2); 1341 const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1); 1342 // for now we dont have a model with a TensorFlowSplit 1343 // with fused 
activation function. 1344 TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); 1345 for (int b = 0; b < batches; ++b) { 1346 for (int y = 0; y < height; ++y) { 1347 for (int x = 0; x < width; ++x) { 1348 int in_c = 0; 1349 for (int i = 0; i < outputs_count; ++i) { 1350 const int depth = ArraySize(*output_dims[i], 0); 1351 for (int c = 0; c < depth; ++c) { 1352 output_data[i][Offset(*output_dims[i], c, x, y, b)] = 1353 input_data[Offset(input_dims, in_c, x, y, b)]; 1354 in_c++; 1355 } 1356 } 1357 TFLITE_DCHECK(in_c == ArraySize(input_dims, 0)); 1358 } 1359 } 1360 } 1361} 1362 1363// TODO(benoitjacob) make this a proper reference impl without Eigen! 1364template <typename Scalar> 1365using MatrixMap = typename std::conditional< 1366 std::is_const<Scalar>::value, 1367 Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, 1368 Eigen::Dynamic, Eigen::Dynamic>>, 1369 Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; 1370 1371template <typename Scalar, int N> 1372MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, 1373 const Dims<N>& dims) { 1374 const int rows = dims.sizes[0]; 1375 int cols = 1; 1376 for (int d = 1; d < N; d++) { 1377 cols *= dims.sizes[d]; 1378 } 1379 return MatrixMap<Scalar>(data, rows, cols); 1380} 1381 1382template <typename Scalar, int N> 1383MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, 1384 const Dims<N>& dims) { 1385 const int cols = dims.sizes[N - 1]; 1386 int rows = 1; 1387 for (int d = 0; d < N - 1; d++) { 1388 rows *= dims.sizes[d]; 1389 } 1390 return MatrixMap<Scalar>(data, rows, cols); 1391} 1392 1393inline int NodeOffset(int b, int h, int w, int height, int width) { 1394 return (b * height + h) * width + w; 1395} 1396 1397inline void AveragePool(const float* input_data, const Dims<4>& input_dims, 1398 int stride_width, int stride_height, int pad_width, 1399 int pad_height, int filter_width, int filter_height, 1400 float output_activation_min, 1401 float 
output_activation_max, float* output_data, 1402 const Dims<4>& output_dims) { 1403 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1404 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1405 const int input_height = ArraySize(input_dims, 2); 1406 const int input_width = ArraySize(input_dims, 1); 1407 const int output_height = ArraySize(output_dims, 2); 1408 const int output_width = ArraySize(output_dims, 1); 1409 for (int batch = 0; batch < batches; ++batch) { 1410 for (int out_y = 0; out_y < output_height; ++out_y) { 1411 for (int out_x = 0; out_x < output_width; ++out_x) { 1412 for (int channel = 0; channel < depth; ++channel) { 1413 const int in_x_origin = (out_x * stride_width) - pad_width; 1414 const int in_y_origin = (out_y * stride_height) - pad_height; 1415 // Compute the boundaries of the filter region clamped so as to 1416 // ensure that the filter window fits in the input array. 1417 const int filter_x_start = std::max(0, -in_x_origin); 1418 const int filter_x_end = 1419 std::min(filter_width, input_width - in_x_origin); 1420 const int filter_y_start = std::max(0, -in_y_origin); 1421 const int filter_y_end = 1422 std::min(filter_height, input_height - in_y_origin); 1423 float total = 0.f; 1424 float filter_count = 0; 1425 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1426 ++filter_y) { 1427 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1428 ++filter_x) { 1429 const int in_x = in_x_origin + filter_x; 1430 const int in_y = in_y_origin + filter_y; 1431 total += 1432 input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1433 filter_count++; 1434 } 1435 } 1436 const float average = total / filter_count; 1437 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1438 ActivationFunctionWithMinMax(average, output_activation_min, 1439 output_activation_max); 1440 } 1441 } 1442 } 1443 } 1444} 1445 1446// legacy, for compatibility with old checked-in code 1447template 
<FusedActivationFunctionType Ac> 1448void AveragePool(const float* input_data, const Dims<4>& input_dims, 1449 int stride_width, int stride_height, int pad_width, 1450 int pad_height, int filter_width, int filter_height, 1451 float* output_data, const Dims<4>& output_dims) { 1452 float output_activation_min, output_activation_max; 1453 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1454 AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, 1455 pad_height, filter_width, filter_height, output_activation_min, 1456 output_activation_max, output_data, output_dims); 1457} 1458 1459// legacy, for compatibility with old checked-in code 1460template <FusedActivationFunctionType Ac> 1461void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, 1462 int pad_width, int pad_height, int filter_width, 1463 int filter_height, float* output_data, 1464 const Dims<4>& output_dims) { 1465 AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1466 filter_width, filter_height, output_data, output_dims); 1467} 1468 1469inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1470 int stride_width, int stride_height, int pad_width, 1471 int pad_height, int filter_width, int filter_height, 1472 int32 output_activation_min, 1473 int32 output_activation_max, uint8* output_data, 1474 const Dims<4>& output_dims) { 1475 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1476 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1477 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1478 const int input_height = ArraySize(input_dims, 2); 1479 const int input_width = ArraySize(input_dims, 1); 1480 const int output_height = ArraySize(output_dims, 2); 1481 const int output_width = ArraySize(output_dims, 1); 1482 for (int batch = 0; batch < batches; ++batch) { 1483 for (int out_y = 0; out_y < output_height; ++out_y) { 1484 for (int out_x = 
0; out_x < output_width; ++out_x) { 1485 for (int channel = 0; channel < depth; ++channel) { 1486 const int in_x_origin = (out_x * stride_width) - pad_width; 1487 const int in_y_origin = (out_y * stride_height) - pad_height; 1488 // Compute the boundaries of the filter region clamped so as to 1489 // ensure that the filter window fits in the input array. 1490 const int filter_x_start = std::max(0, -in_x_origin); 1491 const int filter_x_end = 1492 std::min(filter_width, input_width - in_x_origin); 1493 const int filter_y_start = std::max(0, -in_y_origin); 1494 const int filter_y_end = 1495 std::min(filter_height, input_height - in_y_origin); 1496 int32 acc = 0; 1497 int filter_count = 0; 1498 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1499 ++filter_y) { 1500 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1501 ++filter_x) { 1502 const int in_x = in_x_origin + filter_x; 1503 const int in_y = in_y_origin + filter_y; 1504 acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1505 filter_count++; 1506 } 1507 } 1508 acc = (acc + filter_count / 2) / filter_count; 1509 acc = std::max(acc, output_activation_min); 1510 acc = std::min(acc, output_activation_max); 1511 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1512 static_cast<uint8>(acc); 1513 } 1514 } 1515 } 1516 } 1517} 1518 1519// legacy, for compatibility with old checked-in code 1520template <FusedActivationFunctionType Ac> 1521void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1522 int stride_width, int stride_height, int pad_width, 1523 int pad_height, int filter_width, int filter_height, 1524 int32 output_activation_min, int32 output_activation_max, 1525 uint8* output_data, const Dims<4>& output_dims) { 1526 static_assert(Ac == FusedActivationFunctionType::kNone || 1527 Ac == FusedActivationFunctionType::kRelu || 1528 Ac == FusedActivationFunctionType::kRelu6 || 1529 Ac == FusedActivationFunctionType::kRelu1, 1530 ""); 1531 if (Ac == 
FusedActivationFunctionType::kNone) {
    // With no fused activation, the quantized clamp bounds must already span
    // the full uint8 range.
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Quantized AveragePool wrapper taking a single `stride` applied to both axes.
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_activation_min,
                  output_activation_max, output_data, output_dims);
}

// L2 pooling: each output element is the root-mean-square of the input values
// inside the (possibly clipped) filter window, clamped to
// [output_activation_min, output_activation_max].
// Dims<4> axes follow this file's convention: 0 = depth/channel, 1 = width,
// 2 = height, 3 = batch.
inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int filter_width, int filter_height,
                   float output_activation_min, float output_activation_max,
                   float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float sum_squares = 0.f;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              const float val =
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              sum_squares += val * val;
              filter_count++;
            }
          }
          // RMS over only the in-bounds portion of the window; filter_count
          // is >= 1 here because the output position maps to at least one
          // valid input position.
          const float l2pool_result = std::sqrt(sum_squares / filter_count);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives the float clamp bounds from the fused activation template
// parameter, then forwards to the runtime-parameterized L2Pool above.
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims,
            int stride_width, int stride_height, int pad_width, int pad_height,
            int filter_width, int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
         pad_height, filter_width, filter_height, output_activation_min,
         output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride L2Pool wrapper: `stride` is used for both width and height.
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int filter_width, int filter_height,
            float* output_data, const Dims<4>& output_dims) {
  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
             filter_width, filter_height, output_data, output_dims);
}

// Float max pooling: each output element is the maximum input value inside
// the (clipped) filter window, then clamped to the activation bounds.
inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    float output_activation_min, float output_activation_max,
                    float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          // lowest() (not min()) so negative inputs are handled correctly.
          float max = std::numeric_limits<float>::lowest();
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(max, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives float clamp bounds from the fused activation template parameter.
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, float* output_data,
             const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride float MaxPool wrapper.
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             float* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_data, output_dims);
}

// Quantized (uint8) max pooling. Activation bounds are applied as a plain
// integer clamp; no requantization is needed because max pooling preserves
// the input's quantization parameters.
inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    int32 output_activation_min, int32 output_activation_max,
                    uint8* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // Bounds must themselves fit in uint8 for the std::max/min<uint8> below.
  TFLITE_DCHECK_GE(output_activation_min, 0);
  TFLITE_DCHECK_LE(output_activation_max, 255);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          // 0 is the smallest uint8 value, so it is a safe identity for max.
          uint8 max = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          max = std::max<uint8>(max, output_activation_min);
          max = std::min<uint8>(max, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Quantized MaxPool wrapper keyed on the fused activation template parameter;
// for kNone the bounds must cover the whole uint8 range.
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, int32 output_activation_min,
             int32 output_activation_max, uint8* output_data,
             const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
1764template <FusedActivationFunctionType Ac> 1765void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride, 1766 int pad_width, int pad_height, int filter_width, int filter_height, 1767 int32 output_activation_min, int32 output_activation_max, 1768 uint8* output_data, const Dims<4>& output_dims) { 1769 MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1770 filter_width, filter_height, output_activation_min, 1771 output_activation_max, output_data, output_dims); 1772} 1773 1774inline void LocalResponseNormalization(const float* input_data, 1775 const Dims<4>& input_dims, int range, 1776 float bias, float alpha, float beta, 1777 float* output_data, 1778 const Dims<4>& output_dims) { 1779 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1780 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 1781 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 1782 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1783 1784 for (int b = 0; b < batches; ++b) { 1785 for (int y = 0; y < height; ++y) { 1786 for (int x = 0; x < width; ++x) { 1787 for (int c = 0; c < depth; ++c) { 1788 const int begin_input_c = std::max(0, c - range); 1789 const int end_input_c = std::min(depth, c + range); 1790 float accum = 0.f; 1791 for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) { 1792 const float input_val = 1793 input_data[Offset(input_dims, input_c, x, y, b)]; 1794 accum += input_val * input_val; 1795 } 1796 const float multiplier = std::pow(bias + alpha * accum, -beta); 1797 output_data[Offset(output_dims, c, x, y, b)] = 1798 input_data[Offset(input_dims, c, x, y, b)] * multiplier; 1799 } 1800 } 1801 } 1802 } 1803} 1804 1805inline void Softmax(const float* input_data, const Dims<4>& input_dims, 1806 float beta, float* output_data, 1807 const Dims<4>& output_dims) { 1808 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1809 const int height 
= MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        // Find max element value which we'll use to ensure numerical stability
        // taking advantage of the following equality:
        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
        float max = std::numeric_limits<float>::lowest();
        for (int c = 0; c < depth; ++c) {
          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // Compute sum of exp((x - max) * beta) over the depth dimension.
        float sum = 0.f;
        for (int c = 0; c < depth; ++c) {
          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                          beta);
        }

        // Compute result: each exponential normalized by the sum.
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                       beta) /
              sum;
        }
      }
    }
  }
}

// Quantized (uint8) softmax over the depth dimension, computed entirely in
// gemmlowp fixed-point arithmetic. Inputs with (input - max) below `diff_min`
// are treated as contributing 0 (their exponential is negligible).
inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_beta_multiplier, int32 input_beta_left_shift,
                    int diff_min, uint8* output_data,
                    const Dims<4>& output_dims) {
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large as
  // -32 before multiplying by input_beta_multiplier, and therefore as large as
  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
  // accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int x = 0; x < width; ++x) {
      for (int y = 0; y < height; ++y) {
        // Max over the row, used for numerical stability as in the float path.
        uint8 max_in_row = 0;
        for (int c = 0; c < depth; ++c) {
          max_in_row =
              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // First pass: accumulate sum of exp((input - max) * beta) in
        // fixed-point, skipping diffs below diff_min.
        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
            sum_of_exps =
                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                  exp_on_negative_values(scaled_diff_f8));
          }
        }

        // Compute 1 / sum_of_exps via one_over_one_plus_x_for_x_in_0_1 by
        // first normalizing the sum into [1, 2).
        int32 fixed_sum_of_exps = sum_of_exps.raw();
        int headroom_plus_one =
            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
        // This is the number of bits to the left of the binary point above 1.0.
        // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
        // no later adjustment will be needed.
        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
        int32 shifted_sum_minus_one = static_cast<int32>(
            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
            (static_cast<uint32>(1) << 31));

        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
            FixedPoint0::FromRaw(shifted_sum_minus_one));

        // Second pass: scale each exponential by 1/sum and requantize to
        // uint8 (the +31-8 shift maps the Q0.31 product to 8 output bits).
        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);

            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));

          } else {
            // Negligible exponential: output exactly 0.
            output_data[Offset(output_dims, c, x, y, b)] = 0;
          }
        }
      }
    }
  }
}

// Float logistic (sigmoid): output = 1 / (1 + exp(-input)), elementwise.
inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = 1.f / (1.f + std::exp(-val));
output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

// Quantized (uint8) logistic. Values outside +/-input_range_radius saturate
// to 0/255; in-range values are computed with gemmlowp's fixed-point
// logistic on a Q4.27 rescaled input.
inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_zero_point, int32 input_range_radius,
                     int32 input_multiplier, int input_left_shift,
                     uint8* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
          const int32 input_val_centered =
              static_cast<int32>(input_val_u8) - input_zero_point;
          uint8 output_val;
          if (input_val_centered <= -input_range_radius) {
            // Far negative input: sigmoid is ~0.
            output_val = 0;
          } else if (input_val_centered >= input_range_radius) {
            // Far positive input: sigmoid is ~1 (quantized 255).
            output_val = 255;
          } else {
            const int32 input_val_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_val_centered, input_multiplier, input_left_shift);
            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
            const FixedPoint4 input_val_f4 =
                FixedPoint4::FromRaw(input_val_rescaled);
            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
            // Requantize the Q0.31 result to 8 bits (shift by 31 - 8 = 23),
            // then fold the rounding overflow case 256 back to 255.
            using gemmlowp::RoundingDivideByPOT;
            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
            if (output_val_s32 == 256) {
              output_val_s32 = 255;
            }
            TFLITE_DCHECK_GE(output_val_s32, 0);
            TFLITE_DCHECK_LE(output_val_s32, 255);
            output_val = static_cast<uint8>(output_val_s32);
          }
          output_data[Offset(output_dims, c, x, y, b)] = output_val;
        }
      }
    }
  }
}

// Float tanh, elementwise.
inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = std::tanh(val);
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

// Dequantize uint8 values to float: output = scale * (input - zero_point).
// `scale` is double to preserve precision in the intermediate product.
inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                       int32 zero_point, double scale, float* output_data,
                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int32 val = input_data[Offset(input_dims, c, x, y, b)];
          float result = static_cast<float>(scale * (val - zero_point));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

// FakeQuant: simulates uint8 quantization of floats within [rmin, rmax] by
// quantizing then immediately dequantizing each value.
inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
                      float rmin, float rmax, float* output_data,
                      const Dims<4>& output_dims) {
  // 0 should always be a representable value. Let's assume that the initial
  // min,max range contains 0.
  TFLITE_DCHECK_LE(rmin, 0.);
  TFLITE_DCHECK_GE(rmax, 0.);

  // Determine quantization parameters: zero_point, scale.
  using Integer = uint8;
  const Integer qmin = std::numeric_limits<Integer>::min();
  const Integer qmax = std::numeric_limits<Integer>::max();
  const float qmin_float = qmin;
  const float qmax_float = qmax;
  int32 zero_point = 0;
  float scale = 0.f;
  // If rmin==rmax, both must be zero per the above assertion,
  // so we are done.
  if (rmin != rmax) {
    // First determine the scale.
    scale = (rmax - rmin) / (qmax_float - qmin_float);

    // Zero-point computation.
    // First the initial floating-point computation. The zero-point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair
    // will be roughly machine_epsilon * (sum of absolute values of terms)
    // so we want to use the variant that adds the smaller terms.
    const float zero_point_from_min = qmin_float - rmin / scale;
    const float zero_point_from_max = qmax_float - rmax / scale;
    const float zero_point_from_min_error =
        std::abs(qmin_float) + std::abs(rmin / scale);
    const float zero_point_from_max_error =
        std::abs(qmax_float) + std::abs(rmax / scale);

    const float zero_point_float =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer
    // (our zero points are integer, and this is motivated by the requirement
    // to be able to represent the real value "0" exactly as a quantized value,
    // which is required in multiple places, for example in Im2col with SAME
    // padding).
    if (zero_point_float < qmin_float) {
      zero_point = qmin;
    } else if (zero_point_float > qmax_float) {
      zero_point = qmax;
    } else {
      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
    }
    // The zero point should always be in the range of quantized value,
    // [qmin, qmax].
    TFLITE_DCHECK_GE(zero_point, qmin);
    TFLITE_DCHECK_LE(zero_point, qmax);
  }

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
          // Quantize (round to nearest), clamp to [qmin, qmax], dequantize.
          const float unclamped_quantized_val =
              TfLiteRound(zero_point + src_val / scale);
          const float quantized_val = std::min(
              qmax_float, std::max(qmin_float, unclamped_quantized_val));
          const float dst_val = scale * (quantized_val - zero_point);
          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
        }
      }
    }
  }
}

// Elementwise static_cast from SrcT to DstT; input and output share the same
// layout, so a single flat offset is reused for both.
template <typename SrcT, typename DstT>
inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                 DstT* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
output_data[offset] = static_cast<DstT>(input_data[offset]);
        }
      }
    }
  }
}

// Elementwise std::floor.
inline void Floor(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int offset = Offset(input_dims, c, x, y, b);
          output_data[offset] = std::floor(input_data[offset]);
        }
      }
    }
  }
}

// Gather slices along the input's outermost used dimension (input_rank - 1):
// for each coordinate, copies one contiguous slice of `stride` elements.
// NOTE(review): the memcpy assumes T is trivially copyable — true for the
// numeric types used here.
template <typename T>
inline void Gather(const T* input_data, const Dims<4>& input_dims,
                   int input_rank, const int32* coords_data,
                   const Dims<4>& coords_dims, T* output_data,
                   const Dims<4>& output_dims) {
  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
  int stride = input_dims.strides[input_rank - 1];
  T* out = output_data;

  for (int i = 0; i < coords_dims.sizes[0]; i++) {
    // Each coordinate must index a valid slice of the gathered dimension.
    TFLITE_DCHECK_GE(coords_data[i], 0);
    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
    const T* in = input_data + coords_data[i] * stride;
    memcpy(out, in, sizeof(T) * stride);
    out += stride;
  }
}

// Bilinear resize. The target (height, width) comes from output_size_data,
// a 2-element vector laid out as [height, width].
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                           const int32* output_size_data,
                           const Dims<4>& output_size_dims, float* output_data,
                           const Dims<4>& output_dims) {
  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  int32 input_height = ArraySize(input_dims, 2);
  int32 input_width = ArraySize(input_dims, 1);
  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  // output_size must be a flat 2-element vector.
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      // (y0, y1) bracket the source row; y1 is clamped at the bottom edge.
      float input_y = y * height_scale;
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        for (int c = 0; c < depth; ++c) {
          // Standard 4-tap bilinear blend of the surrounding pixels.
          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
                                    (1 - (input_y - y0)) *
                                    (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x0, y1, b)] *
                                    (input_y - y0) * (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x1, y0, b)] *
                                    (1 - (input_y - y0)) * (input_x - x0) +
                                input_data[Offset(input_dims, c, x1, y1, b)] *
                                    (input_y - y0) * (input_x - x0);
          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
        }
      }
    }
  }
}

// SpaceToBatchND for 4D tensors: moves spatial blocks of size
// block_shape (height, width) into the batch dimension, zero-filling the
// padded region given by paddings_data = [top, bottom, left, right].
template <typename T>
inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims,
                           const int32* paddings_data,
                           const Dims<4>& paddings_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width =
      ArraySize(output_dims, 1);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];
  const int padding_top = paddings_data[0];
  const int padding_left = paddings_data[2];

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    // Decompose the output batch index into (input batch, block offsets).
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
        if (out_h * block_shape_height + shift_h < padding_top ||
            out_h * block_shape_height + shift_h >=
                padding_top + input_height ||
            out_w * block_shape_width + shift_w < padding_left ||
            out_w * block_shape_width + shift_w >= padding_left + input_width) {
          // Position falls in the padded border: fill with zeros.
          memset(out, 0, depth * sizeof(T));
        } else {
          // Copy the whole depth column from the mapped input position.
          const T* in =
              input_data +
              Offset(input_dims, 0,
                     (out_w * block_shape_width + shift_w) - padding_left,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     input_batch);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}

// BatchToSpaceND for 4D tensors: the inverse rearrangement of SpaceToBatchND
// (without cropping) — moves block offsets encoded in the batch dimension
// back into the spatial dimensions.
template <typename T>
inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_width = block_shape_data[1];
  const int block_shape_height = block_shape_data[0];

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        // Recover the output batch and the spatial block offsets from the
        // input batch index, mirroring SpaceToBatchND's decomposition.
        int out_batch = in_batch % output_batch_size;
        int out_w = in_w * block_shape_width +
                    (in_batch / output_batch_size) % block_shape_width;
        int out_h = in_h * block_shape_height +
                    (in_batch / output_batch_size) / block_shape_width;
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
        memcpy(out, in, depth * sizeof(T));
      }
    }
  }
}

// Zero-pads a 4D tensor. Paddings are indexed per dimension:
// [0]=depth, [1]=width, [2]=height, [3]=batch. The input is consumed
// sequentially (in_ptr advances only on non-padding positions), which is
// valid because both tensors share the same iteration order.
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int left_b_padding = left_paddings[3];
  const int left_h_padding = left_paddings[2];
  const int left_w_padding = left_paddings[1];
  const int left_d_padding = left_paddings[0];

  const int right_b_padding = right_paddings[3];
  const int right_h_padding = right_paddings[2];
  const int right_w_padding = right_paddings[1];
  const int right_d_padding = right_paddings[0];

  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            *out_ptr++ = 0;
          } else {
            *out_ptr++ = *in_ptr++;
          }
        }
      }
    }
  }
}

// True while a strided iteration from `index` toward `stop` should continue;
// direction depends on the sign of `stride`.
inline bool LoopCondition(int index, int stop, int stride) {
  return stride > 0 ? index < stop : index > stop;
}

// StridedSlice helper: effective start index for a dimension of size `dim`,
// ignoring `start` when the begin mask bit is set.
inline int StartIndex(int start, int stride, int dim, bool masked) {
  return masked ? (stride > 0 ? 0 : dim - 1) : start;
}

// StridedSlice helper: effective (exclusive) stop index; -1 marks the
// position before index 0 for negative strides.
inline int StopIndex(int stop, int stride, int dim, bool masked) {
  return masked ? (stride > 0 ? dim : -1) : stop;
}

// StridedSlice over a 4D tensor. Mask bits map to dimensions as:
// bit 3 (8) = batch, bit 2 (4) = height, bit 1 (2) = width, bit 0 (1) = depth.
template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  TFLITE_DCHECK_EQ(starts.size(), 4);
  TFLITE_DCHECK_EQ(stops.size(), 4);
  TFLITE_DCHECK_EQ(strides.size(), 4);
  const int start_b =
      StartIndex(starts[3], strides[3], input_dims.sizes[3], begin_mask & 8);
  const int stop_b =
      StopIndex(stops[3], strides[3], input_dims.sizes[3], end_mask & 8);
  const int start_h =
      StartIndex(starts[2], strides[2], input_dims.sizes[2], begin_mask & 4);
  const int stop_h =
      StopIndex(stops[2], strides[2], input_dims.sizes[2], end_mask & 4);
  const int start_w =
      StartIndex(starts[1], strides[1], input_dims.sizes[1], begin_mask & 2);
  const int stop_w =
      StopIndex(stops[1], strides[1], input_dims.sizes[1], end_mask & 2);
  const int start_d =
      StartIndex(starts[0], strides[0], input_dims.sizes[0], begin_mask & 1);
  const int stop_d =
      StopIndex(stops[0],
strides[0], input_dims.sizes[0], end_mask & 1); 2371 2372 T* out_ptr = output_data; 2373 for (int in_b = start_b; LoopCondition(in_b, stop_b, strides[3]); 2374 in_b += strides[3]) { 2375 for (int in_h = start_h; LoopCondition(in_h, stop_h, strides[2]); 2376 in_h += strides[2]) { 2377 for (int in_w = start_w; LoopCondition(in_w, stop_w, strides[1]); 2378 in_w += strides[1]) { 2379 for (int in_d = start_d; LoopCondition(in_d, stop_d, strides[0]); 2380 in_d += strides[0]) { 2381 *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; 2382 } 2383 } 2384 } 2385 } 2386} 2387 2388template <typename T> 2389inline void Slice(const T* input_data, const Dims<4>& input_dims, 2390 const std::vector<int>& begin, const std::vector<int>& size, 2391 T* output_data, const Dims<4>& output_dims) { 2392 // TODO(dkalenichenko): This op only supports 4D tensors. 2393 TFLITE_DCHECK_EQ(begin.size(), 4); 2394 TFLITE_DCHECK_EQ(size.size(), 4); 2395 const int start_b = begin[3]; 2396 const int stop_b = 2397 size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3]; 2398 const int start_h = begin[2]; 2399 const int stop_h = 2400 size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2]; 2401 const int start_w = begin[1]; 2402 const int stop_w = 2403 size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1]; 2404 const int start_d = begin[0]; 2405 const int stop_d = 2406 size[0] == -1 ? 
input_dims.sizes[0] - start_d : start_d + size[0]; 2407 2408 T* out_ptr = output_data; 2409 for (int in_b = start_b; in_b < stop_b; ++in_b) { 2410 for (int in_h = start_h; in_h < stop_h; ++in_h) { 2411 for (int in_w = start_w; in_w < stop_w; ++in_w) { 2412 for (int in_d = start_d; in_d < stop_d; ++in_d) { 2413 *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; 2414 } 2415 } 2416 } 2417 } 2418} 2419 2420template <typename T> 2421inline void Mean(T* input_data, const int* input_dims, const int input_num_dims, 2422 T* output_data, const int* output_dims, 2423 const int output_num_dims, const int* axis, 2424 const int num_axis_dimensions, bool keep_dims, int* temp_index, 2425 int* resolved_axis) { 2426 // resets output data. 2427 size_t num_outputs = 1; 2428 for (int idx = 0; idx < output_num_dims; ++idx) { 2429 num_outputs *= static_cast<size_t>(output_dims[idx]); 2430 } 2431 for (size_t idx = 0; idx < num_outputs; ++idx) { 2432 output_data[idx] = 0; 2433 } 2434 // resets temp index. 2435 for (int idx = 0; idx < input_num_dims; ++idx) { 2436 temp_index[idx] = 0; 2437 } 2438 // resolves axis. 2439 int num_resolved_axis = 0; 2440 for (int idx = 0; idx < num_axis_dimensions; ++idx) { 2441 int current = axis[idx]; 2442 TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0); 2443 if (current < 0) { 2444 current += input_num_dims; 2445 } 2446 bool is_dup = false; 2447 for (int j = 0; j < num_resolved_axis; ++j) { 2448 if (resolved_axis[j] == current) { 2449 is_dup = true; 2450 break; 2451 } 2452 } 2453 if (!is_dup) { 2454 resolved_axis[num_resolved_axis++] = current; 2455 } 2456 } 2457 // iterates through input_data. 
2458 for (bool has_next = true; has_next; 2459 has_next = NextIndex(input_num_dims, input_dims, temp_index)) { 2460 size_t input_offset = 2461 ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr); 2462 size_t output_offset = 2463 ReducedOutputOffset(input_num_dims, input_dims, temp_index, 2464 num_resolved_axis, resolved_axis); 2465 output_data[output_offset] += input_data[input_offset]; 2466 } 2467 // takes average by num of elements added to get mean. 2468 size_t num_elements_in_axis = 1; 2469 for (int idx = 0; idx < num_resolved_axis; ++idx) { 2470 num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]); 2471 } 2472 for (size_t idx = 0; idx < num_outputs; ++idx) { 2473 output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) / 2474 num_elements_in_axis); 2475 } 2476} 2477 2478template <typename T> 2479inline void Mean(const T* input_data, const Dims<4>& input_dims, 2480 const std::vector<int>& reduction_indices, T* output_data, 2481 const Dims<4>& output_dims) { 2482 const int output_batch = ArraySize(output_dims, 3); 2483 const int output_height = ArraySize(output_dims, 2); 2484 const int output_width = ArraySize(output_dims, 1); 2485 const int output_depth = ArraySize(output_dims, 0); 2486 2487 const int input_height = ArraySize(input_dims, 2); 2488 const int input_width = ArraySize(input_dims, 1); 2489 2490 // The current implementation only supports simultaneous reduction over 2491 // width and height. 
2492 TFLITE_DCHECK_EQ(reduction_indices.size(), 2); 2493 TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) || 2494 (reduction_indices[0] == 2 && reduction_indices[1] == 1)); 2495 TFLITE_DCHECK_EQ(output_height, 1); 2496 TFLITE_DCHECK_EQ(output_width, 1); 2497 2498 for (int out_b = 0; out_b < output_batch; ++out_b) { 2499 for (int out_d = 0; out_d < output_depth; ++out_d) { 2500 float value = 0; 2501 for (int in_h = 0; in_h < input_height; ++in_h) { 2502 for (int in_w = 0; in_w < input_width; ++in_w) { 2503 value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)]; 2504 } 2505 } 2506 output_data[Offset(output_dims, out_d, 0, 0, out_b)] = 2507 value / (input_width * input_height); 2508 } 2509 } 2510} 2511 2512template <typename T> 2513void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, 2514 const Dims<4>& input2_dims, T* output_data, 2515 const Dims<4>& output_dims) { 2516 NdArrayDesc<4> desc1; 2517 NdArrayDesc<4> desc2; 2518 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 2519 2520 // In Tensorflow, the dimensions are canonically named (batch_number, row, 2521 // col, channel), with extents (batches, height, width, depth), with the 2522 // trailing dimension changing most rapidly (channels has the smallest stride, 2523 // typically 1 element). 2524 // 2525 // In generated C code, we store arrays with the dimensions reversed. The 2526 // first dimension has smallest stride. 2527 // 2528 // We name our variables by their Tensorflow convention, but generate C code 2529 // nesting loops such that the innermost loop has the smallest stride for the 2530 // best cache behavior. 
2531 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 2532 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 2533 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 2534 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 2535 output_data[Offset(output_dims, c, x, y, b)] = 2536 input1_data[SubscriptToIndex(desc1, c, x, y, b)] - 2537 input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 2538 } 2539 } 2540 } 2541 } 2542} 2543 2544template <typename T> 2545void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, 2546 const T* input2_data, T* output_data, 2547 const Dims<4>& output_dims) { 2548 int batches = MatchingArraySize(input1_dims, 3, output_dims, 3); 2549 int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2); 2550 int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1); 2551 int depth = MatchingArraySize(input1_dims, 0, output_dims, 0); 2552 2553 auto min_value = input2_data[0]; 2554 2555 for (int b = 0; b < batches; b++) { 2556 for (int y = 0; y < input_height; y++) { 2557 for (int x = 0; x < input_width; x++) { 2558 for (int c = 0; c < depth; c++) { 2559 int offset = Offset(input1_dims, c, x, y, b); 2560 output_data[offset] = 2561 input1_data[offset] > min_value ? 
min_value : input1_data[offset]; 2562 } 2563 } 2564 } 2565 } 2566} 2567 2568template <typename T> 2569void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, 2570 const T* input2_data, T* output_data, 2571 const Dims<4>& output_dims) { 2572 int batches = MatchingArraySize(input1_dims, 3, output_dims, 3); 2573 int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2); 2574 int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1); 2575 int depth = MatchingArraySize(input1_dims, 0, output_dims, 0); 2576 2577 auto max_value = input2_data[0]; 2578 2579 for (int b = 0; b < batches; b++) { 2580 for (int y = 0; y < input_height; y++) { 2581 for (int x = 0; x < input_width; x++) { 2582 for (int c = 0; c < depth; c++) { 2583 int offset = Offset(input1_dims, c, x, y, b); 2584 output_data[offset] = 2585 input1_data[offset] < max_value ? max_value : input1_data[offset]; 2586 } 2587 } 2588 } 2589 } 2590} 2591 2592template <typename T1, typename T2, typename T3> 2593void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, 2594 T2* output_data, const Dims<4>& output_dims) { 2595 // The current ArgMax implemention can only determine the index of the maximum 2596 // value in the last dimension. So the axis argument is ignored. 2597 TFLITE_DCHECK_EQ(axis[0], 3); 2598 2599 // For ArgMax, the number of output dimensions = (number of input dimensions - 2600 // 1). For the sake of simplicity, the output dimensions are equal to the 2601 // input dimensions here. We enforce the constraint that the last dimension 2602 // must always be 1. 
2603 TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1); 2604 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2605 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2606 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2607 const int depth = ArraySize(input_dims, 0); 2608 for (int b = 0; b < batches; ++b) { 2609 for (int y = 0; y < height; ++y) { 2610 for (int x = 0; x < width; ++x) { 2611 auto max_value = input_data[Offset(input_dims, 0, x, y, b)]; 2612 int max_index = 0; 2613 for (int d = 1; d < depth; ++d) { 2614 const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)]; 2615 if (curr_value > max_value) { 2616 max_value = curr_value; 2617 max_index = d; 2618 } 2619 } 2620 output_data[Offset(output_dims, 0, x, y, b)] = max_index; 2621 } 2622 } 2623 } 2624} 2625 2626template <typename T> 2627void Transpose(const T* input, const Dims<4>& input_dims, T* output, 2628 const Dims<4>& output_dims, int* permuted_axes) { 2629 int out_sizes[4]; 2630 // Compute the inverse permutation array so we can do an output centered 2631 // transpose. Also, check to make sure output_dims is matching input_dims. 2632 for (int k = 0; k < 4; k++) { 2633 out_sizes[k] = 2634 MatchingArraySize(input_dims, permuted_axes[k], output_dims, k); 2635 } 2636 2637 // Naive transpose loop (iterate on output index and compute input index). 2638 int o[4]; // loop index (on output). 
2639 int i[4]; 2640 for (o[3] = 0; o[3] < out_sizes[3]; o[3]++) { 2641 i[permuted_axes[3]] = o[3]; 2642 for (o[2] = 0; o[2] < out_sizes[2]; o[2]++) { 2643 i[permuted_axes[2]] = o[2]; 2644 for (o[1] = 0; o[1] < out_sizes[1]; o[1]++) { 2645 i[permuted_axes[1]] = o[1]; 2646 for (o[0] = 0; o[0] < out_sizes[0]; o[0]++) { 2647 i[permuted_axes[0]] = o[0]; 2648 output[Offset(output_dims, o)] = input[Offset(input_dims, i)]; 2649 } 2650 } 2651 } 2652 } 2653} 2654 2655} // namespace reference_ops 2656} // namespace tflite 2657 2658#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 2659