reference_ops.h revision 9393d604ebd63664a27d28617f95fa5f60495270
1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3Licensed under the Apache License, Version 2.0 (the "License"); 4you may not use this file except in compliance with the License. 5You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9Unless required by applicable law or agreed to in writing, software 10distributed under the License is distributed on an "AS IS" BASIS, 11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12See the License for the specific language governing permissions and 13limitations under the License. 14==============================================================================*/ 15#ifndef THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 16#define THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 17 18#include <stdint.h> 19#include <sys/types.h> 20#include <algorithm> 21#include <cmath> 22#include <limits> 23#include <memory> 24#include <type_traits> 25 26#include "third_party/eigen3/Eigen/Core" 27#include "fixedpoint/fixedpoint.h" 28#include "public/gemmlowp.h" 29#include "tensorflow/contrib/lite/kernels/internal/common.h" 30#include "tensorflow/contrib/lite/kernels/internal/round.h" 31#include "tensorflow/contrib/lite/kernels/internal/types.h" 32 33namespace tflite { 34namespace reference_ops { 35 36inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( 37 int32 x, int32 quantized_multiplier, int right_shift) { 38 using gemmlowp::RoundingDivideByPOT; 39 using gemmlowp::SaturatingRoundingDoublingHighMul; 40 return RoundingDivideByPOT( 41 SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); 42} 43 44inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( 45 int32 x, int32 quantized_multiplier, int left_shift) { 46 using gemmlowp::SaturatingRoundingDoublingHighMul; 47 return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), 48 quantized_multiplier); 49} 50 51template <typename 
T> 52int CountLeadingZeros(T integer_input) { 53 static_assert(std::is_unsigned<T>::value, 54 "Only unsigned integer types handled."); 55 const T one_in_leading_positive = static_cast<T>(1) 56 << (std::numeric_limits<T>::digits - 1); 57 int leading_zeros = 0; 58 while (integer_input < one_in_leading_positive) { 59 integer_input <<= 1; 60 ++leading_zeros; 61 } 62 return leading_zeros; 63} 64 65// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE 66// BROADCASTING. 67// 68// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional 69// rectangular array of numbers. 70// 71// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. 72// However, as Dims<N> is to be deprecated, this class exists as an adaptor 73// to enable simple unoptimized implementations of element-wise broadcasting 74// operations. 75template <int N> 76struct NdArrayDesc { 77 // The "extent" of each dimension. Indices along dimension d must be in the 78 // half-open interval [0, extents[d]). 79 int extents[N]; 80 81 // The number of *elements* (not bytes) between consecutive indices of each 82 // dimension. 83 int strides[N]; 84}; 85 86// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING 87// ELEMENT-WISE BROADCASTING. 88// 89// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. 90inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, 91 int i3) { 92 TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]); 93 TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]); 94 TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]); 95 TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]); 96 return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + 97 i3 * desc.strides[3]; 98} 99 100// Given the dimensions of the operands for an element-wise binary broadcast, 101// adjusts them so that they can be directly iterated over with simple loops. 
102// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and 103// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. 104// 105// This function assumes that the two input shapes are compatible up to 106// broadcasting and the shorter one has already been prepended with 1s to be the 107// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), 108// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that 109// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be 110// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). 111// 112// When two shapes are compatible up to broadcasting, for each dimension d, 113// the input extents are either equal, or one of them is 1. 114// 115// This function performs the following for each dimension d: 116// - If the extents are equal, then do nothing since the loop that walks over 117// both of the input arrays is correct. 118// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 119// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows 120// array0 to be referenced *at any index* in dimension d and still access the 121// same slice. 122template <int N> 123inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, 124 const Dims<N>& input1_dims, 125 NdArrayDesc<N>* desc0_out, 126 NdArrayDesc<N>* desc1_out) { 127 TFLITE_DCHECK(desc0_out != nullptr); 128 TFLITE_DCHECK(desc1_out != nullptr); 129 130 // Copy dims to desc. 131 for (int i = 0; i < N; ++i) { 132 desc0_out->extents[i] = input0_dims.sizes[i]; 133 desc0_out->strides[i] = input0_dims.strides[i]; 134 desc1_out->extents[i] = input1_dims.sizes[i]; 135 desc1_out->strides[i] = input1_dims.strides[i]; 136 } 137 138 // Walk over each dimension. If the extents are equal do nothing. 139 // Otherwise, set the desc with extent 1 to have extent equal to the other and 140 // stride 0. 
141 for (int i = 0; i < N; ++i) { 142 const int extent0 = ArraySize(input0_dims, i); 143 const int extent1 = ArraySize(input1_dims, i); 144 if (extent0 != extent1) { 145 if (extent0 == 1) { 146 desc0_out->strides[i] = 0; 147 desc0_out->extents[i] = extent1; 148 } else { 149 TFLITE_DCHECK_EQ(extent1, 1); 150 desc1_out->strides[i] = 0; 151 desc1_out->extents[i] = extent0; 152 } 153 } 154 } 155} 156 157inline void Conv(const float* input_data, const Dims<4>& input_dims, 158 const float* filter_data, const Dims<4>& filter_dims, 159 const float* bias_data, const Dims<4>& bias_dims, 160 int stride_width, int stride_height, int pad_width, 161 int pad_height, float output_activation_min, 162 float output_activation_max, float* output_data, 163 const Dims<4>& output_dims, float* im2col_data, 164 const Dims<4>& im2col_dims) { 165 (void)im2col_data; // only used in optimized code. 166 (void)im2col_dims; // only used in optimized code. 167 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 168 const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); 169 const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0); 170 if (bias_data) { 171 TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0)); 172 } 173 const int input_height = ArraySize(input_dims, 2); 174 const int input_width = ArraySize(input_dims, 1); 175 const int filter_height = ArraySize(filter_dims, 2); 176 const int filter_width = ArraySize(filter_dims, 1); 177 const int output_height = ArraySize(output_dims, 2); 178 const int output_width = ArraySize(output_dims, 1); 179 for (int batch = 0; batch < batches; ++batch) { 180 for (int out_y = 0; out_y < output_height; ++out_y) { 181 for (int out_x = 0; out_x < output_width; ++out_x) { 182 for (int out_channel = 0; out_channel < output_depth; ++out_channel) { 183 const int in_x_origin = (out_x * stride_width) - pad_width; 184 const int in_y_origin = (out_y * stride_height) - pad_height; 185 float total = 
0.f; 186 for (int filter_y = 0; filter_y < filter_height; ++filter_y) { 187 for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 188 for (int in_channel = 0; in_channel < input_depth; ++in_channel) { 189 const int in_x = in_x_origin + filter_x; 190 const int in_y = in_y_origin + filter_y; 191 // If the location is outside the bounds of the input image, 192 // use zero as a default value. 193 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && 194 (in_y < input_height)) { 195 float input_value = input_data[Offset(input_dims, in_channel, 196 in_x, in_y, batch)]; 197 float filter_value = 198 filter_data[Offset(filter_dims, in_channel, filter_x, 199 filter_y, out_channel)]; 200 total += (input_value * filter_value); 201 } 202 } 203 } 204 } 205 float bias_value = 0.0f; 206 if (bias_data) { 207 bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; 208 } 209 output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = 210 ActivationFunctionWithMinMax(total + bias_value, 211 output_activation_min, 212 output_activation_max); 213 } 214 } 215 } 216 } 217} 218 219// legacy, for compatibility with old checked-in code 220template <FusedActivationFunctionType Ac> 221void Conv(const float* input_data, const Dims<4>& input_dims, 222 const float* filter_data, const Dims<4>& filter_dims, 223 const float* bias_data, const Dims<4>& bias_dims, int stride_width, 224 int stride_height, int pad_width, int pad_height, float* output_data, 225 const Dims<4>& output_dims, float* im2col_data, 226 const Dims<4>& im2col_dims) { 227 float output_activation_min, output_activation_max; 228 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 229 Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims, 230 stride_width, stride_height, pad_width, pad_height, 231 output_activation_min, output_activation_max, output_data, output_dims, 232 im2col_data, im2col_dims); 233} 234 235// legacy, for compatibility with old checked-in code 
236template <FusedActivationFunctionType Ac> 237void Conv(const float* input_data, const Dims<4>& input_dims, 238 const float* filter_data, const Dims<4>& filter_dims, 239 const float* bias_data, const Dims<4>& bias_dims, int stride, 240 int pad_width, int pad_height, float* output_data, 241 const Dims<4>& output_dims, float* im2col_data, 242 const Dims<4>& im2col_dims) { 243 Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data, 244 bias_dims, stride, stride, pad_width, pad_height, output_data, 245 output_dims, im2col_data, im2col_dims); 246} 247 248inline void Conv(const uint8* input_data, const Dims<4>& input_dims, 249 int32 input_offset, const uint8* filter_data, 250 const Dims<4>& filter_dims, int32 filter_offset, 251 const int32* bias_data, const Dims<4>& bias_dims, 252 int stride_width, int stride_height, int pad_width, 253 int pad_height, int32 output_offset, int32 output_multiplier, 254 int output_shift, int32 output_activation_min, 255 int32 output_activation_max, uint8* output_data, 256 const Dims<4>& output_dims, uint8* im2col_data, 257 const Dims<4>& im2col_dims, 258 gemmlowp::GemmContext* gemm_context) { 259 (void)im2col_data; // only used in optimized code. 260 (void)im2col_dims; // only used in optimized code. 261 (void)gemm_context; // only used in optimized code. 
262 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 263 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 264 const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0); 265 const int output_depth = 266 MatchingArraySize(filter_dims, 3, bias_dims, 0, output_dims, 0); 267 const int input_height = ArraySize(input_dims, 2); 268 const int input_width = ArraySize(input_dims, 1); 269 const int filter_height = ArraySize(filter_dims, 2); 270 const int filter_width = ArraySize(filter_dims, 1); 271 const int output_height = ArraySize(output_dims, 2); 272 const int output_width = ArraySize(output_dims, 1); 273 for (int batch = 0; batch < batches; ++batch) { 274 for (int out_y = 0; out_y < output_height; ++out_y) { 275 for (int out_x = 0; out_x < output_width; ++out_x) { 276 for (int out_channel = 0; out_channel < output_depth; ++out_channel) { 277 const int in_x_origin = (out_x * stride_width) - pad_width; 278 const int in_y_origin = (out_y * stride_height) - pad_height; 279 int32 acc = 0; 280 for (int filter_y = 0; filter_y < filter_height; ++filter_y) { 281 for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 282 for (int in_channel = 0; in_channel < input_depth; ++in_channel) { 283 const int in_x = in_x_origin + filter_x; 284 const int in_y = in_y_origin + filter_y; 285 // If the location is outside the bounds of the input image, 286 // use zero as a default value. 
287 if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && 288 (in_y < input_height)) { 289 int32 input_val = input_data[Offset(input_dims, in_channel, 290 in_x, in_y, batch)]; 291 int32 filter_val = 292 filter_data[Offset(filter_dims, in_channel, filter_x, 293 filter_y, out_channel)]; 294 acc += 295 (filter_val + filter_offset) * (input_val + input_offset); 296 } 297 } 298 } 299 } 300 if (bias_data) { 301 acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)]; 302 } 303 acc = MultiplyByQuantizedMultiplierSmallerThanOne( 304 acc, output_multiplier, output_shift); 305 acc += output_offset; 306 acc = std::max(acc, output_activation_min); 307 acc = std::min(acc, output_activation_max); 308 output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] = 309 static_cast<uint8>(acc); 310 } 311 } 312 } 313 } 314} 315 316// legacy, for compatibility with old checked-in code 317template <FusedActivationFunctionType Ac> 318inline void Conv(const uint8* input_data, const Dims<4>& input_dims, 319 int32 input_offset, const uint8* filter_data, 320 const Dims<4>& filter_dims, int32 filter_offset, 321 const int32* bias_data, const Dims<4>& bias_dims, 322 int stride_width, int stride_height, int pad_width, 323 int pad_height, int32 output_offset, int32 output_multiplier, 324 int output_shift, int32 output_activation_min, 325 int32 output_activation_max, uint8* output_data, 326 const Dims<4>& output_dims, uint8* im2col_data, 327 const Dims<4>& im2col_dims, 328 gemmlowp::GemmContext* gemm_context) { 329 static_assert(Ac == FusedActivationFunctionType::kNone || 330 Ac == FusedActivationFunctionType::kRelu || 331 Ac == FusedActivationFunctionType::kRelu6 || 332 Ac == FusedActivationFunctionType::kRelu1, 333 ""); 334 if (Ac == FusedActivationFunctionType::kNone) { 335 TFLITE_DCHECK_EQ(output_activation_min, 0); 336 TFLITE_DCHECK_EQ(output_activation_max, 255); 337 } 338 Conv(input_data, input_dims, input_offset, filter_data, filter_dims, 339 filter_offset, bias_data, 
bias_dims, stride_width, stride_height, 340 pad_width, pad_height, output_offset, output_multiplier, output_shift, 341 output_activation_min, output_activation_max, output_data, output_dims, 342 im2col_data, im2col_dims, gemm_context); 343} 344 345// legacy, for compatibility with old checked-in code 346template <FusedActivationFunctionType Ac> 347void Conv(const uint8* input_data, const Dims<4>& input_dims, 348 int32 input_offset, const uint8* filter_data, 349 const Dims<4>& filter_dims, int32 filter_offset, 350 const int32* bias_data, const Dims<4>& bias_dims, int stride, 351 int pad_width, int pad_height, int32 output_offset, 352 int32 output_multiplier, int output_shift, 353 int32 output_activation_min, int32 output_activation_max, 354 uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, 355 const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { 356 Conv<Ac>(input_data, input_dims, input_offset, filter_data, filter_dims, 357 filter_offset, bias_data, bias_dims, stride, stride, pad_width, 358 pad_height, output_offset, output_multiplier, output_shift, 359 output_activation_min, output_activation_max, output_data, 360 output_dims, im2col_data, im2col_dims, gemm_context); 361} 362 363template <typename T> 364inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, 365 int block_size, T* output_data, 366 const Dims<4>& output_dims) { 367 const int input_depth = ArraySize(input_dims, 0); 368 const int input_width = ArraySize(input_dims, 1); 369 const int input_height = ArraySize(input_dims, 2); 370 const int input_batch = ArraySize(input_dims, 3); 371 372 const int output_depth = ArraySize(output_dims, 0); 373 const int output_width = ArraySize(output_dims, 1); 374 const int output_height = ArraySize(output_dims, 2); 375 const int output_batch = ArraySize(output_dims, 3); 376 377 TFLITE_DCHECK_EQ(input_width * block_size, output_width); 378 TFLITE_DCHECK_EQ(input_height * block_size, output_height); 379 
TFLITE_DCHECK_EQ(input_depth, output_depth * block_size * block_size); 380 TFLITE_DCHECK_EQ(input_batch, output_batch); 381 382 for (int out_b = 0; out_b < output_batch; ++out_b) { 383 for (int out_h = 0; out_h < output_height; ++out_h) { 384 for (int out_w = 0; out_w < output_width; ++out_w) { 385 for (int out_d = 0; out_d < output_depth; ++out_d) { 386 const int in_d = 387 out_d + ((out_h % block_size) * block_size + out_w % block_size) * 388 output_depth; 389 const int in_w = out_w / block_size; 390 const int in_h = out_h / block_size; 391 const int in_b = out_b; 392 393 const int output_index = 394 Offset(output_dims, out_d, out_w, out_h, out_b); 395 const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b); 396 397 output_data[output_index] = input_data[input_index]; 398 } 399 } 400 } 401 } 402} 403 404template <typename T> 405inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, 406 int block_size, T* output_data, 407 const Dims<4>& output_dims) { 408 const int input_depth = ArraySize(input_dims, 0); 409 const int input_width = ArraySize(input_dims, 1); 410 const int input_height = ArraySize(input_dims, 2); 411 const int input_batch = ArraySize(input_dims, 3); 412 413 const int output_depth = ArraySize(output_dims, 0); 414 const int output_width = ArraySize(output_dims, 1); 415 const int output_height = ArraySize(output_dims, 2); 416 const int output_batch = ArraySize(output_dims, 3); 417 418 TFLITE_DCHECK_EQ(input_width, output_width * block_size); 419 TFLITE_DCHECK_EQ(input_height, output_height * block_size); 420 TFLITE_DCHECK_EQ(input_depth * block_size * block_size, output_depth); 421 TFLITE_DCHECK_EQ(input_batch, output_batch); 422 423 for (int in_b = 0; in_b < input_batch; ++in_b) { 424 for (int in_h = 0; in_h < input_height; ++in_h) { 425 for (int in_w = 0; in_w < input_width; ++in_w) { 426 for (int in_d = 0; in_d < input_depth; ++in_d) { 427 const int out_d = 428 in_d + ((in_h % block_size) * block_size + in_w % 
block_size) * 429 input_depth; 430 const int out_w = in_w / block_size; 431 const int out_h = in_h / block_size; 432 const int out_b = in_b; 433 434 const int output_index = 435 Offset(output_dims, out_d, out_w, out_h, out_b); 436 const int input_index = Offset(input_dims, in_d, in_w, in_h, in_b); 437 438 output_data[output_index] = input_data[input_index]; 439 } 440 } 441 } 442 } 443} 444 445inline void FullyConnected(const float* input_data, const Dims<4>& input_dims, 446 const float* weights_data, 447 const Dims<4>& weights_dims, const float* bias_data, 448 const Dims<4>& bias_dims, 449 float output_activation_min, 450 float output_activation_max, float* output_data, 451 const Dims<4>& output_dims) { 452 // TODO(benoitjacob): This really should be: 453 // const int batches = ArraySize(output_dims, 1); 454 // but the current --variable_batch hack consists in overwriting the 3rd 455 // dimension with the runtime batch size, as we don't keep track for each 456 // array of which dimension is the batch dimension in it. 
457 const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * 458 ArraySize(output_dims, 3); 459 const int output_depth = MatchingArraySize(weights_dims, 1, output_dims, 0); 460 const int accum_depth = ArraySize(weights_dims, 0); 461 TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); 462 TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims)); 463 for (int b = 0; b < batches; ++b) { 464 for (int out_c = 0; out_c < output_depth; ++out_c) { 465 float total = 0.f; 466 for (int d = 0; d < accum_depth; ++d) { 467 total += input_data[b * accum_depth + d] * 468 weights_data[out_c * accum_depth + d]; 469 } 470 float bias_value = 0.0f; 471 if (bias_data) { 472 bias_value = bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; 473 } 474 output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax( 475 total + bias_value, output_activation_min, output_activation_max); 476 } 477 } 478} 479 480// legacy, for compatibility with old checked-in code 481template <FusedActivationFunctionType Ac> 482void FullyConnected(const float* input_data, const Dims<4>& input_dims, 483 const float* weights_data, const Dims<4>& weights_dims, 484 const float* bias_data, const Dims<4>& bias_dims, 485 float* output_data, const Dims<4>& output_dims) { 486 float output_activation_min, output_activation_max; 487 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 488 FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data, 489 bias_dims, output_activation_min, output_activation_max, 490 output_data, output_dims); 491} 492 493inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, 494 int32 input_offset, const uint8* filter_data, 495 const Dims<4>& filter_dims, int32 filter_offset, 496 const int32* bias_data, const Dims<4>& bias_dims, 497 int32 output_offset, int32 output_multiplier, 498 int output_shift, int32 output_activation_min, 499 int32 output_activation_max, uint8* output_data, 500 const Dims<4>& output_dims, 501 
gemmlowp::GemmContext* gemm_context) { 502 (void)gemm_context; // only used in optimized code. 503 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 504 // TODO(benoitjacob): This really should be: 505 // const int batches = ArraySize(output_dims, 1); 506 // but the current --variable_batch hack consists in overwriting the 3rd 507 // dimension with the runtime batch size, as we don't keep track for each 508 // array of which dimension is the batch dimension in it. 509 const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * 510 ArraySize(output_dims, 3); 511 const int output_depth = MatchingArraySize(filter_dims, 1, output_dims, 0); 512 const int accum_depth = ArraySize(filter_dims, 0); 513 TFLITE_DCHECK(IsPackedWithoutStrides(input_dims)); 514 TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims)); 515 for (int b = 0; b < batches; ++b) { 516 for (int out_c = 0; out_c < output_depth; ++out_c) { 517 int32 acc = 0; 518 for (int d = 0; d < accum_depth; ++d) { 519 int32 input_val = input_data[b * accum_depth + d]; 520 int32 filter_val = filter_data[out_c * accum_depth + d]; 521 acc += (filter_val + filter_offset) * (input_val + input_offset); 522 } 523 if (bias_data) { 524 acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)]; 525 } 526 acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier, 527 output_shift); 528 acc += output_offset; 529 acc = std::max(acc, output_activation_min); 530 acc = std::min(acc, output_activation_max); 531 output_data[out_c + output_depth * b] = static_cast<uint8>(acc); 532 } 533 } 534} 535 536// legacy, for compatibility with old checked-in code 537template <FusedActivationFunctionType Ac> 538void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, 539 int32 input_offset, const uint8* filter_data, 540 const Dims<4>& filter_dims, int32 filter_offset, 541 const int32* bias_data, const Dims<4>& bias_dims, 542 int32 output_offset, int32 output_multiplier, 543 int output_shift, int32 
output_activation_min, 544 int32 output_activation_max, uint8* output_data, 545 const Dims<4>& output_dims, 546 gemmlowp::GemmContext* gemm_context) { 547 static_assert(Ac == FusedActivationFunctionType::kNone || 548 Ac == FusedActivationFunctionType::kRelu || 549 Ac == FusedActivationFunctionType::kRelu6 || 550 Ac == FusedActivationFunctionType::kRelu1, 551 ""); 552 if (Ac == FusedActivationFunctionType::kNone) { 553 TFLITE_DCHECK_EQ(output_activation_min, 0); 554 TFLITE_DCHECK_EQ(output_activation_max, 255); 555 } 556 FullyConnected(input_data, input_dims, input_offset, filter_data, filter_dims, 557 filter_offset, bias_data, bias_dims, output_offset, 558 output_multiplier, output_shift, output_activation_min, 559 output_activation_max, output_data, output_dims, gemm_context); 560} 561 562template <FusedActivationFunctionType Ac> 563void NonGlobalBatchNormalization( 564 const float* input_data, const Dims<4>& input_dims, const float* mean_data, 565 const Dims<4>& mean_dims, const float* multiplier_data, 566 const Dims<4>& multiplier_dims, const float* offset_data, 567 const Dims<4>& offset_dims, float* output_data, 568 const Dims<4>& output_dims) { 569 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 570 const int height = 571 MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2, 572 offset_dims, 2, output_dims, 2); 573 const int width = 574 MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1, 575 offset_dims, 1, output_dims, 1); 576 const int depth = 577 MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, 578 offset_dims, 0, output_dims, 0); 579 580 for (int b = 0; b < batches; ++b) { 581 for (int y = 0; y < height; ++y) { 582 for (int x = 0; x < width; ++x) { 583 for (int c = 0; c < depth; ++c) { 584 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 585 (input_data[Offset(input_dims, c, x, y, b)] - 586 mean_data[Offset(mean_dims, c, x, y, 0)]) * 587 
multiplier_data[Offset(multiplier_dims, c, x, y, 0)] + 588 offset_data[Offset(offset_dims, c, x, y, 0)]); 589 } 590 } 591 } 592 } 593} 594 595template <FusedActivationFunctionType Ac> 596void GlobalBatchNormalization(const float* input_data, 597 const Dims<4>& input_dims, const float* mean_data, 598 const Dims<4>& mean_dims, 599 const float* multiplier_data, 600 const Dims<4>& multiplier_dims, 601 const float* offset_data, 602 const Dims<4>& offset_dims, float* output_data, 603 const Dims<4>& output_dims) { 604 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 605 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 606 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 607 const int depth = 608 MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, 609 offset_dims, 0, output_dims, 0); 610 611 for (int b = 0; b < batches; ++b) { 612 for (int y = 0; y < height; ++y) { 613 for (int x = 0; x < width; ++x) { 614 for (int c = 0; c < depth; ++c) { 615 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 616 (input_data[Offset(input_dims, c, x, y, b)] - 617 mean_data[Offset(mean_dims, c, 0, 0, 0)]) * 618 multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] + 619 offset_data[Offset(offset_dims, c, 0, 0, 0)]); 620 } 621 } 622 } 623 } 624} 625 626inline void Relu(const float* input_data, const Dims<4>& input_dims, 627 float* output_data, const Dims<4>& output_dims) { 628 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 629 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 630 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 631 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 632 for (int b = 0; b < batches; ++b) { 633 for (int y = 0; y < height; ++y) { 634 for (int x = 0; x < width; ++x) { 635 for (int c = 0; c < depth; ++c) { 636 float val = input_data[Offset(input_dims, c, x, y, b)]; 637 const float lower = 0; 638 
float clamped = val < lower ? lower : val; 639 output_data[Offset(output_dims, c, x, y, b)] = clamped; 640 } 641 } 642 } 643 } 644} 645 646inline void Relu1(const float* input_data, const Dims<4>& input_dims, 647 float* output_data, const Dims<4>& output_dims) { 648 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 649 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 650 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 651 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 652 for (int b = 0; b < batches; ++b) { 653 for (int y = 0; y < height; ++y) { 654 for (int x = 0; x < width; ++x) { 655 for (int c = 0; c < depth; ++c) { 656 float val = input_data[Offset(input_dims, c, x, y, b)]; 657 const float upper = 1; 658 const float lower = -1; 659 float clamped = val > upper ? upper : val < lower ? lower : val; 660 output_data[Offset(output_dims, c, x, y, b)] = clamped; 661 } 662 } 663 } 664 } 665} 666 667inline void Relu6(const float* input_data, const Dims<4>& input_dims, 668 float* output_data, const Dims<4>& output_dims) { 669 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 670 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 671 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 672 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 673 for (int b = 0; b < batches; ++b) { 674 for (int y = 0; y < height; ++y) { 675 for (int x = 0; x < width; ++x) { 676 for (int c = 0; c < depth; ++c) { 677 float val = input_data[Offset(input_dims, c, x, y, b)]; 678 const float upper = 6; 679 const float lower = 0; 680 float clamped = val > upper ? upper : val < lower ? 
lower : val; 681 output_data[Offset(output_dims, c, x, y, b)] = clamped; 682 } 683 } 684 } 685 } 686} 687 688template <FusedActivationFunctionType Ac> 689void L2Normalization(const float* input_data, const Dims<4>& input_dims, 690 float* output_data, const Dims<4>& output_dims) { 691 static_assert(Ac == FusedActivationFunctionType::kNone, ""); 692 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 693 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 694 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 695 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 696 for (int b = 0; b < batches; ++b) { 697 for (int y = 0; y < height; ++y) { 698 for (int x = 0; x < width; ++x) { 699 float squared_l2_norm = 0; 700 for (int c = 0; c < depth; ++c) { 701 float val = input_data[Offset(input_dims, c, x, y, b)]; 702 squared_l2_norm += val * val; 703 } 704 float l2_norm = std::sqrt(squared_l2_norm); 705 for (int c = 0; c < depth; ++c) { 706 output_data[Offset(output_dims, c, x, y, b)] = 707 input_data[Offset(input_dims, c, x, y, b)] / l2_norm; 708 } 709 } 710 } 711 } 712} 713 714inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, 715 int* output_shift) { 716 *output_shift = 11; 717 while (input >= (1 << 29)) { 718 input /= 4; 719 ++*output_shift; 720 } 721 TFLITE_DCHECK_GT(input, 0); 722 const unsigned max_left_shift_bits = __builtin_clz(input) - 1; 723 const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; 724 const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; 725 *output_shift -= left_shift_bit_pairs; 726 input <<= 2 * left_shift_bit_pairs; 727 TFLITE_DCHECK_GE(input, (1 << 27)); 728 TFLITE_DCHECK_LT(input, (1 << 29)); 729 using gemmlowp::FixedPoint; 730 using gemmlowp::Rescale; 731 using gemmlowp::SaturatingRoundingMultiplyByPOT; 732 // Using 3 integer bits gives us enough room for the internal arithmetic in 733 // this Newton-Raphson iteration. 
734 using F3 = FixedPoint<int32, 3>; 735 using F0 = FixedPoint<int32, 0>; 736 const F3 fixedpoint_input = F3::FromRaw(input >> 1); 737 const F3 fixedpoint_half_input = 738 SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); 739 const F3 fixedpoint_half_three = 740 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); 741 // Newton-Raphson iteration 742 // Naive unoptimized starting guess: x = 1 743 F3 x = F3::One(); 744 // Naive unoptimized number of iterations: 5 745 for (int i = 0; i < 5; i++) { 746 const F3 x3 = Rescale<3>(x * x * x); 747 x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); 748 } 749 const F0 fixedpoint_half_sqrt_2 = 750 GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); 751 x = x * fixedpoint_half_sqrt_2; 752 *output_inv_sqrt = x.raw(); 753 if (*output_shift < 0) { 754 *output_inv_sqrt <<= -*output_shift; 755 *output_shift = 0; 756 } 757} 758 759inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, 760 int32 input_zero_point, uint8* output_data, 761 const Dims<4>& output_dims) { 762 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 763 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 764 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 765 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 766 TFLITE_DCHECK_EQ(batches, 1); 767 TFLITE_DCHECK_EQ(height, 1); 768 TFLITE_DCHECK_EQ(width, 1); 769 int32 square_l2_norm = 0; 770 for (int i = 0; i < depth; i++) { 771 int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point; 772 square_l2_norm += diff * diff; 773 } 774 int32 inv_l2norm_multiplier; 775 int inv_l2norm_shift; 776 GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, 777 &inv_l2norm_shift); 778 779 for (int i = 0; i < depth; i++) { 780 int32 diff = input_data[Offset(input_dims, i, 0, 0, 0)] - input_zero_point; 781 int32 rescaled_diff = 
MultiplyByQuantizedMultiplierSmallerThanOne( 782 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); 783 int32 unclamped_output_val = 128 + rescaled_diff; 784 int32 output_val = std::min(255, std::max(0, unclamped_output_val)); 785 output_data[Offset(output_dims, i, 0, 0, 0)] = 786 static_cast<uint8>(output_val); 787 } 788} 789 790inline void Add(const float* input1_data, const Dims<4>& input1_dims, 791 const float* input2_data, const Dims<4>& input2_dims, 792 float output_activation_min, float output_activation_max, 793 float* output_data, const Dims<4>& output_dims) { 794 const int batches = 795 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 796 const int height = 797 MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 798 const int width = 799 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 800 const int depth = 801 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 802 for (int b = 0; b < batches; ++b) { 803 for (int y = 0; y < height; ++y) { 804 for (int x = 0; x < width; ++x) { 805 for (int c = 0; c < depth; ++c) { 806 output_data[Offset(output_dims, c, x, y, b)] = 807 ActivationFunctionWithMinMax( 808 input1_data[Offset(input1_dims, c, x, y, b)] + 809 input2_data[Offset(input2_dims, c, x, y, b)], 810 output_activation_min, output_activation_max); 811 } 812 } 813 } 814 } 815} 816 817// legacy, for compatibility with old checked-in code 818template <FusedActivationFunctionType Ac> 819void Add(const float* input1_data, const Dims<4>& input1_dims, 820 const float* input2_data, const Dims<4>& input2_dims, 821 float* output_data, const Dims<4>& output_dims) { 822 float output_activation_min, output_activation_max; 823 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 824 825 Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, 826 output_activation_max, output_data, output_dims); 827} 828 829template <FusedActivationFunctionType Ac> 830inline 
void Add(int left_shift, const uint8* input1_data, 831 const Dims<4>& input1_dims, int32 input1_offset, 832 int32 input1_multiplier, int input1_shift, 833 const uint8* input2_data, const Dims<4>& input2_dims, 834 int32 input2_offset, int32 input2_multiplier, int input2_shift, 835 int32 output_offset, int32 output_multiplier, int output_shift, 836 int32 output_activation_min, int32 output_activation_max, 837 uint8* output_data, const Dims<4>& output_dims) { 838 static_assert(Ac == FusedActivationFunctionType::kNone || 839 Ac == FusedActivationFunctionType::kRelu || 840 Ac == FusedActivationFunctionType::kRelu6 || 841 Ac == FusedActivationFunctionType::kRelu1, 842 ""); 843 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 844 if (Ac == FusedActivationFunctionType::kNone) { 845 TFLITE_DCHECK_EQ(output_activation_min, 0); 846 TFLITE_DCHECK_EQ(output_activation_max, 255); 847 } 848 const int batches = 849 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 850 const int height = 851 MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 852 const int width = 853 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 854 const int depth = 855 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 856 for (int b = 0; b < batches; ++b) { 857 for (int y = 0; y < height; ++y) { 858 for (int x = 0; x < width; ++x) { 859 for (int c = 0; c < depth; ++c) { 860 const int32 input1_val = 861 input1_offset + input1_data[Offset(input1_dims, c, x, y, b)]; 862 const int32 input2_val = 863 input2_offset + input2_data[Offset(input2_dims, c, x, y, b)]; 864 const int32 shifted_input1_val = input1_val * (1 << left_shift); 865 const int32 shifted_input2_val = input2_val * (1 << left_shift); 866 const int32 scaled_input1_val = 867 MultiplyByQuantizedMultiplierSmallerThanOne( 868 shifted_input1_val, input1_multiplier, input1_shift); 869 const int32 scaled_input2_val = 870 MultiplyByQuantizedMultiplierSmallerThanOne( 871 
shifted_input2_val, input2_multiplier, input2_shift); 872 const int32 raw_sum = scaled_input1_val + scaled_input2_val; 873 const int32 raw_output = 874 MultiplyByQuantizedMultiplierSmallerThanOne( 875 raw_sum, output_multiplier, output_shift) + 876 output_offset; 877 const int32 clamped_output = 878 std::min(output_activation_max, 879 std::max(output_activation_min, raw_output)); 880 output_data[Offset(output_dims, c, x, y, b)] = 881 static_cast<uint8>(clamped_output); 882 } 883 } 884 } 885 } 886} 887 888// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary 889// dimensionality if the runtime code does a single loop over one dimension 890// that handles broadcasting as the base case. The code generator would then 891// generate max(D1, D2) nested for loops. 892template <FusedActivationFunctionType Ac> 893void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims, 894 const float* input2_data, const Dims<4>& input2_dims, 895 float* output_data, const Dims<4>& output_dims) { 896 gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); 897 898 NdArrayDesc<4> desc1; 899 NdArrayDesc<4> desc2; 900 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 901 902 // In Tensorflow, the dimensions are canonically named (batch_number, row, 903 // col, channel), with extents (batches, height, width, depth), with the 904 // trailing dimension changing most rapidly (channels has the smallest stride, 905 // typically 1 element). 906 // 907 // In generated C code, we store arrays with the dimensions reversed. The 908 // first dimension has smallest stride. 909 // 910 // We name our variables by their Tensorflow convention, but generate C code 911 // nesting loops such that the innermost loop has the smallest stride for the 912 // best cache behavior. 
913 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 914 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 915 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 916 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 917 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 918 input1_data[SubscriptToIndex(desc1, c, x, y, b)] + 919 input2_data[SubscriptToIndex(desc2, c, x, y, b)]); 920 } 921 } 922 } 923 } 924} 925 926inline void BroadcastAdd(int left_shift, const uint8* input1_data, 927 const Dims<4>& input1_dims, int32 input1_offset, 928 int32 input1_multiplier, int input1_shift, 929 const uint8* input2_data, const Dims<4>& input2_dims, 930 int32 input2_offset, int32 input2_multiplier, 931 int input2_shift, int32 output_offset, 932 int32 output_multiplier, int output_shift, 933 int32 output_activation_min, 934 int32 output_activation_max, uint8* output_data, 935 const Dims<4>& output_dims) { 936 gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); 937 938 NdArrayDesc<4> desc1; 939 NdArrayDesc<4> desc2; 940 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 941 942 // In Tensorflow, the dimensions are canonically named (batch_number, row, 943 // col, channel), with extents (batches, height, width, depth), with the 944 // trailing dimension changing most rapidly (channels has the smallest stride, 945 // typically 1 element). 946 // 947 // In generated C code, we store arrays with the dimensions reversed. The 948 // first dimension has smallest stride. 949 // 950 // We name our variables by their Tensorflow convention, but generate C code 951 // nesting loops such that the innermost loop has the smallest stride for the 952 // best cache behavior. 
953 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 954 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 955 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 956 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 957 const int32 input1_val = 958 input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; 959 const int32 input2_val = 960 input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 961 const int32 shifted_input1_val = input1_val * (1 << left_shift); 962 const int32 shifted_input2_val = input2_val * (1 << left_shift); 963 const int32 scaled_input1_val = 964 MultiplyByQuantizedMultiplierSmallerThanOne( 965 shifted_input1_val, input1_multiplier, input1_shift); 966 const int32 scaled_input2_val = 967 MultiplyByQuantizedMultiplierSmallerThanOne( 968 shifted_input2_val, input2_multiplier, input2_shift); 969 const int32 raw_sum = scaled_input1_val + scaled_input2_val; 970 const int32 raw_output = 971 MultiplyByQuantizedMultiplierSmallerThanOne( 972 raw_sum, output_multiplier, output_shift) + 973 output_offset; 974 const int32 clamped_output = 975 std::min(output_activation_max, 976 std::max(output_activation_min, raw_output)); 977 output_data[Offset(output_dims, c, x, y, b)] = 978 static_cast<uint8>(clamped_output); 979 } 980 } 981 } 982 } 983} 984 985template <FusedActivationFunctionType Ac> 986inline void BroadcastAdd(int left_shift, const uint8* input1_data, 987 const Dims<4>& input1_dims, int32 input1_offset, 988 int32 input1_multiplier, int input1_shift, 989 const uint8* input2_data, const Dims<4>& input2_dims, 990 int32 input2_offset, int32 input2_multiplier, 991 int input2_shift, int32 output_offset, 992 int32 output_multiplier, int output_shift, 993 int32 output_activation_min, 994 int32 output_activation_max, uint8* output_data, 995 const Dims<4>& output_dims) { 996 static_assert(Ac == FusedActivationFunctionType::kNone || 997 Ac == FusedActivationFunctionType::kRelu || 998 Ac == FusedActivationFunctionType::kRelu6 || 
999 Ac == FusedActivationFunctionType::kRelu1, 1000 ""); 1001 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1002 if (Ac == FusedActivationFunctionType::kNone) { 1003 TFLITE_DCHECK_EQ(output_activation_min, 0); 1004 TFLITE_DCHECK_EQ(output_activation_max, 255); 1005 } 1006 BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset, 1007 input1_multiplier, input1_shift, input2_data, input2_dims, 1008 input2_offset, input2_multiplier, input2_shift, output_offset, 1009 output_multiplier, output_shift, output_activation_min, 1010 output_activation_max, output_data, output_dims); 1011} 1012 1013inline void Mul(const float* input1_data, const Dims<4>& input1_dims, 1014 const float* input2_data, const Dims<4>& input2_dims, 1015 float output_activation_min, float output_activation_max, 1016 float* output_data, const Dims<4>& output_dims) { 1017 const int batches = 1018 MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3); 1019 const int height = 1020 MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2); 1021 const int width = 1022 MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1); 1023 const int depth = 1024 MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0); 1025 for (int b = 0; b < batches; ++b) { 1026 for (int y = 0; y < height; ++y) { 1027 for (int x = 0; x < width; ++x) { 1028 for (int c = 0; c < depth; ++c) { 1029 output_data[Offset(output_dims, c, x, y, b)] = 1030 ActivationFunctionWithMinMax( 1031 input1_data[Offset(input1_dims, c, x, y, b)] * 1032 input2_data[Offset(input2_dims, c, x, y, b)], 1033 output_activation_min, output_activation_max); 1034 } 1035 } 1036 } 1037 } 1038} 1039 1040// legacy, for compatibility with old checked-in code 1041template <FusedActivationFunctionType Ac> 1042void Mul(const float* input1_data, const Dims<4>& input1_dims, 1043 const float* input2_data, const Dims<4>& input2_dims, 1044 float* output_data, const Dims<4>& output_dims) { 1045 float 
output_activation_min, output_activation_max; 1046 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1047 1048 Mul(input1_data, input1_dims, input2_data, input2_dims, output_activation_min, 1049 output_activation_max, output_data, output_dims); 1050} 1051 1052// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary 1053// dimensionality if the runtime code does a single loop over one dimension 1054// that handles broadcasting as the base case. The code generator would then 1055// generate max(D1, D2) nested for loops. 1056template <FusedActivationFunctionType Ac> 1057void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, 1058 const float* input2_data, const Dims<4>& input2_dims, 1059 float* output_data, const Dims<4>& output_dims) { 1060 gemmlowp::ScopedProfilingLabel label("BroadcastMul"); 1061 1062 NdArrayDesc<4> desc1; 1063 NdArrayDesc<4> desc2; 1064 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 1065 1066 // In Tensorflow, the dimensions are canonically named (batch_number, row, 1067 // col, channel), with extents (batches, height, width, depth), with the 1068 // trailing dimension changing most rapidly (channels has the smallest 1069 // stride, typically 1 element). 1070 // 1071 // In generated C code, we store arrays with the dimensions reversed. The 1072 // first dimension has smallest stride. 1073 // 1074 // We name our variables by their Tensorflow convention, but generate C code 1075 // nesting loops such that the innermost loop has the smallest stride for 1076 // the best cache behavior. 
1077 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 1078 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 1079 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 1080 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 1081 output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( 1082 input1_data[SubscriptToIndex(desc1, c, x, y, b)] * 1083 input2_data[SubscriptToIndex(desc2, c, x, y, b)]); 1084 } 1085 } 1086 } 1087 } 1088} 1089 1090inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, 1091 int32 input1_offset, const uint8* input2_data, 1092 const Dims<4>& input2_dims, int32 input2_offset, 1093 int32 output_offset, int32 output_multiplier, 1094 int output_shift, int32 output_activation_min, 1095 int32 output_activation_max, uint8* output_data, 1096 const Dims<4>& output_dims) { 1097 gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); 1098 1099 NdArrayDesc<4> desc1; 1100 NdArrayDesc<4> desc2; 1101 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 1102 1103 // In Tensorflow, the dimensions are canonically named (batch_number, row, 1104 // col, channel), with extents (batches, height, width, depth), with the 1105 // trailing dimension changing most rapidly (channels has the smallest 1106 // stride, typically 1 element). 1107 // 1108 // In generated C code, we store arrays with the dimensions reversed. The 1109 // first dimension has smallest stride. 1110 // 1111 // We name our variables by their Tensorflow convention, but generate C code 1112 // nesting loops such that the innermost loop has the smallest stride for 1113 // the best cache behavior. 
1114 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 1115 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 1116 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 1117 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 1118 const int32 input1_val = 1119 input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; 1120 const int32 input2_val = 1121 input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 1122 const int32 unclamped_result = 1123 output_offset + 1124 MultiplyByQuantizedMultiplierSmallerThanOne( 1125 input1_val * input2_val, output_multiplier, output_shift); 1126 const int32 clamped_output = 1127 std::min(output_activation_max, 1128 std::max(output_activation_min, unclamped_result)); 1129 output_data[Offset(output_dims, c, x, y, b)] = 1130 static_cast<uint8>(clamped_output); 1131 } 1132 } 1133 } 1134 } 1135} 1136 1137// legacy, for compatibility with old checked-in code 1138template <FusedActivationFunctionType Ac> 1139inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, 1140 int32 input1_offset, const uint8* input2_data, 1141 const Dims<4>& input2_dims, int32 input2_offset, 1142 int32 output_offset, int32 output_multiplier, 1143 int output_shift, int32 output_activation_min, 1144 int32 output_activation_max, uint8* output_data, 1145 const Dims<4>& output_dims) { 1146 BroadcastMul(input1_data, input1_dims, input1_offset, input2_data, 1147 input2_dims, input2_offset, output_offset, output_multiplier, 1148 output_shift, output_activation_min, output_activation_max, 1149 output_data, output_dims); 1150} 1151 1152template <FusedActivationFunctionType Ac, typename Scalar> 1153void Concatenation(int concat_dim, const Scalar* const* input_data, 1154 const Dims<4>* const* input_dims, int inputs_count, 1155 Scalar* output_data, const Dims<4>& output_dims) { 1156 TFLITE_DCHECK_GT(inputs_count, 1); 1157 int concat_size = 0; 1158 for (int i = 0; i < inputs_count; i++) { 1159 for (int j = 0; j < 4; j++) { 
1160 if (j != concat_dim) { 1161 MatchingArraySize(*input_dims[i], j, output_dims, j); 1162 } 1163 } 1164 concat_size += ArraySize(*input_dims[i], concat_dim); 1165 } 1166 TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); 1167 TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); 1168 int outer_size = 1; 1169 for (int i = concat_dim + 1; i < 4; i++) { 1170 outer_size *= output_dims.sizes[i]; 1171 } 1172 Scalar* output_ptr = output_data; 1173 for (int k = 0; k < outer_size; k++) { 1174 for (int i = 0; i < inputs_count; ++i) { 1175 const int copy_size = 1176 input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; 1177 memcpy(output_ptr, input_data[i] + k * copy_size, 1178 copy_size * sizeof(Scalar)); 1179 output_ptr += copy_size; 1180 } 1181 } 1182} 1183 1184template <FusedActivationFunctionType Ac, typename Scalar> 1185void DepthConcatenation(const Scalar* const* input_data, 1186 const Dims<4>* const* input_dims, int inputs_count, 1187 Scalar* output_data, const Dims<4>& output_dims) { 1188 Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, 1189 output_data, output_dims); 1190} 1191 1192inline void LstmCell(const float* input_data, const Dims<4>& input_dims, 1193 const float* prev_activ_data, 1194 const Dims<4>& prev_activ_dims, const float* weights_data, 1195 const Dims<4>& weights_dims, const float* bias_data, 1196 const Dims<4>& bias_dims, const float* prev_state_data, 1197 const Dims<4>& prev_state_dims, float* output_state_data, 1198 const Dims<4>& output_state_dims, float* output_activ_data, 1199 const Dims<4>& output_activ_dims, float* concat_temp_data, 1200 const Dims<4>& concat_temp_dims, float* activ_temp_data, 1201 const Dims<4>& activ_temp_dims) { 1202 const int batches = 1203 MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, 1204 output_state_dims, 3, output_activ_dims, 3); 1205 const int height = 1206 MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, 1207 
output_state_dims, 2, output_activ_dims, 2); 1208 const int width = 1209 MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, 1210 output_state_dims, 1, output_activ_dims, 1); 1211 TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1); 1212 TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1); 1213 const int input_depth = ArraySize(input_dims, 0); 1214 const int prev_activ_depth = ArraySize(prev_activ_dims, 0); 1215 const int total_input_depth = prev_activ_depth + input_depth; 1216 TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); 1217 TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1218 1); 1219 const int intern_activ_depth = 1220 MatchingArraySize(weights_dims, 1, bias_dims, 0); 1221 TFLITE_CHECK_EQ(intern_activ_depth % 4, 0); 1222 const int output_depth = 1223 MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0, 1224 output_state_dims, 0, output_activ_dims, 0); 1225 TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4); 1226 1227 // Concatenate prev_activ and input data together 1228 std::vector<float const*> concat_input_arrays_data; 1229 std::vector<Dims<4> const*> concat_input_arrays_dims; 1230 concat_input_arrays_data.push_back(input_data); 1231 concat_input_arrays_data.push_back(prev_activ_data); 1232 concat_input_arrays_dims.push_back(&input_dims); 1233 concat_input_arrays_dims.push_back(&prev_activ_dims); 1234 Concatenation<FusedActivationFunctionType::kNone, float>( 1235 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), 1236 concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); 1237 1238 // Fully connected 1239 FullyConnected<FusedActivationFunctionType::kNone>( 1240 concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, 1241 bias_dims, activ_temp_data, activ_temp_dims); 1242 1243 // Memory state update (the LSTM "guts") 1244 for (int b = 0; b < batches; ++b) { 1245 for (int w = 0; w < width; ++w) { 1246 for (int h = 0; h < height; ++h) { 1247 for 
(int c = 0; c < output_depth; ++c) { 1248 const float input_gate = 1249 1.f / 1250 (1.f + std::exp(-activ_temp_data[Offset( 1251 activ_temp_dims, 0 * output_depth + c, w, h, b)])); 1252 const float new_input = std::tanh(activ_temp_data[Offset( 1253 activ_temp_dims, 1 * output_depth + c, w, h, b)]); 1254 const float forget_gate = 1255 1.f / 1256 (1.f + std::exp(-activ_temp_data[Offset( 1257 activ_temp_dims, 2 * output_depth + c, w, h, b)])); 1258 const float output_gate = 1259 1.f / 1260 (1.f + std::exp(-activ_temp_data[Offset( 1261 activ_temp_dims, 3 * output_depth + c, w, h, b)])); 1262 const float new_state = 1263 input_gate * new_input + 1264 forget_gate * 1265 prev_state_data[Offset(prev_state_dims, c, w, h, b)]; 1266 output_state_data[Offset(output_state_dims, c, w, h, b)] = new_state; 1267 output_activ_data[Offset(output_activ_dims, c, w, h, b)] = 1268 output_gate * std::tanh(new_state); 1269 } 1270 } 1271 } 1272 } 1273} 1274 1275template <FusedActivationFunctionType Ac, typename Scalar> 1276void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, 1277 int outputs_count, Scalar* const* output_data, 1278 const Dims<4>* const* output_dims) { 1279 TFLITE_DCHECK_GE(outputs_count, 1); 1280 for (int i = 0; i < outputs_count; i++) { 1281 /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); 1282 /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); 1283 /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); 1284 } 1285 const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3); 1286 const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2); 1287 const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1); 1288 // for now we dont have a model with a TensorFlowSplit 1289 // with fused activation function. 
1290 TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone); 1291 for (int b = 0; b < batches; ++b) { 1292 for (int y = 0; y < height; ++y) { 1293 for (int x = 0; x < width; ++x) { 1294 int in_c = 0; 1295 for (int i = 0; i < outputs_count; ++i) { 1296 const int depth = ArraySize(*output_dims[i], 0); 1297 for (int c = 0; c < depth; ++c) { 1298 output_data[i][Offset(*output_dims[i], c, x, y, b)] = 1299 input_data[Offset(input_dims, in_c, x, y, b)]; 1300 in_c++; 1301 } 1302 } 1303 TFLITE_DCHECK(in_c == ArraySize(input_dims, 0)); 1304 } 1305 } 1306 } 1307} 1308 1309// TODO(benoitjacob) make this a proper reference impl without Eigen! 1310template <typename Scalar> 1311using MatrixMap = typename std::conditional< 1312 std::is_const<Scalar>::value, 1313 Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, 1314 Eigen::Dynamic, Eigen::Dynamic>>, 1315 Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; 1316 1317template <typename Scalar, int N> 1318MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, 1319 const Dims<N>& dims) { 1320 const int rows = dims.sizes[0]; 1321 int cols = 1; 1322 for (int d = 1; d < N; d++) { 1323 cols *= dims.sizes[d]; 1324 } 1325 return MatrixMap<Scalar>(data, rows, cols); 1326} 1327 1328template <typename Scalar, int N> 1329MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, 1330 const Dims<N>& dims) { 1331 const int cols = dims.sizes[N - 1]; 1332 int rows = 1; 1333 for (int d = 0; d < N - 1; d++) { 1334 rows *= dims.sizes[d]; 1335 } 1336 return MatrixMap<Scalar>(data, rows, cols); 1337} 1338 1339inline int NodeOffset(int b, int h, int w, int height, int width) { 1340 return (b * height + h) * width + w; 1341} 1342 1343inline void AveragePool(const float* input_data, const Dims<4>& input_dims, 1344 int stride_width, int stride_height, int pad_width, 1345 int pad_height, int filter_width, int filter_height, 1346 float output_activation_min, 1347 float output_activation_max, float* 
output_data, 1348 const Dims<4>& output_dims) { 1349 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1350 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1351 const int input_height = ArraySize(input_dims, 2); 1352 const int input_width = ArraySize(input_dims, 1); 1353 const int output_height = ArraySize(output_dims, 2); 1354 const int output_width = ArraySize(output_dims, 1); 1355 for (int batch = 0; batch < batches; ++batch) { 1356 for (int out_y = 0; out_y < output_height; ++out_y) { 1357 for (int out_x = 0; out_x < output_width; ++out_x) { 1358 for (int channel = 0; channel < depth; ++channel) { 1359 const int in_x_origin = (out_x * stride_width) - pad_width; 1360 const int in_y_origin = (out_y * stride_height) - pad_height; 1361 // Compute the boundaries of the filter region clamped so as to 1362 // ensure that the filter window fits in the input array. 1363 const int filter_x_start = std::max(0, -in_x_origin); 1364 const int filter_x_end = 1365 std::min(filter_width, input_width - in_x_origin); 1366 const int filter_y_start = std::max(0, -in_y_origin); 1367 const int filter_y_end = 1368 std::min(filter_height, input_height - in_y_origin); 1369 float total = 0.f; 1370 float filter_count = 0; 1371 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1372 ++filter_y) { 1373 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1374 ++filter_x) { 1375 const int in_x = in_x_origin + filter_x; 1376 const int in_y = in_y_origin + filter_y; 1377 total += 1378 input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1379 filter_count++; 1380 } 1381 } 1382 const float average = total / filter_count; 1383 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1384 ActivationFunctionWithMinMax(average, output_activation_min, 1385 output_activation_max); 1386 } 1387 } 1388 } 1389 } 1390} 1391 1392// legacy, for compatibility with old checked-in code 1393template <FusedActivationFunctionType Ac> 1394void 
AveragePool(const float* input_data, const Dims<4>& input_dims, 1395 int stride_width, int stride_height, int pad_width, 1396 int pad_height, int filter_width, int filter_height, 1397 float* output_data, const Dims<4>& output_dims) { 1398 float output_activation_min, output_activation_max; 1399 GetActivationMinMax(Ac, &output_activation_min, &output_activation_max); 1400 AveragePool(input_data, input_dims, stride_width, stride_height, pad_width, 1401 pad_height, filter_width, filter_height, output_activation_min, 1402 output_activation_max, output_data, output_dims); 1403} 1404 1405// legacy, for compatibility with old checked-in code 1406template <FusedActivationFunctionType Ac> 1407void AveragePool(const float* input_data, const Dims<4>& input_dims, int stride, 1408 int pad_width, int pad_height, int filter_width, 1409 int filter_height, float* output_data, 1410 const Dims<4>& output_dims) { 1411 AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height, 1412 filter_width, filter_height, output_data, output_dims); 1413} 1414 1415inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1416 int stride_width, int stride_height, int pad_width, 1417 int pad_height, int filter_width, int filter_height, 1418 int32 output_activation_min, 1419 int32 output_activation_max, uint8* output_data, 1420 const Dims<4>& output_dims) { 1421 TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 1422 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1423 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1424 const int input_height = ArraySize(input_dims, 2); 1425 const int input_width = ArraySize(input_dims, 1); 1426 const int output_height = ArraySize(output_dims, 2); 1427 const int output_width = ArraySize(output_dims, 1); 1428 for (int batch = 0; batch < batches; ++batch) { 1429 for (int out_y = 0; out_y < output_height; ++out_y) { 1430 for (int out_x = 0; out_x < output_width; ++out_x) { 1431 
for (int channel = 0; channel < depth; ++channel) { 1432 const int in_x_origin = (out_x * stride_width) - pad_width; 1433 const int in_y_origin = (out_y * stride_height) - pad_height; 1434 // Compute the boundaries of the filter region clamped so as to 1435 // ensure that the filter window fits in the input array. 1436 const int filter_x_start = std::max(0, -in_x_origin); 1437 const int filter_x_end = 1438 std::min(filter_width, input_width - in_x_origin); 1439 const int filter_y_start = std::max(0, -in_y_origin); 1440 const int filter_y_end = 1441 std::min(filter_height, input_height - in_y_origin); 1442 int32 acc = 0; 1443 int filter_count = 0; 1444 for (int filter_y = filter_y_start; filter_y < filter_y_end; 1445 ++filter_y) { 1446 for (int filter_x = filter_x_start; filter_x < filter_x_end; 1447 ++filter_x) { 1448 const int in_x = in_x_origin + filter_x; 1449 const int in_y = in_y_origin + filter_y; 1450 acc += input_data[Offset(input_dims, channel, in_x, in_y, batch)]; 1451 filter_count++; 1452 } 1453 } 1454 acc = (acc + filter_count / 2) / filter_count; 1455 acc = std::max(acc, output_activation_min); 1456 acc = std::min(acc, output_activation_max); 1457 output_data[Offset(output_dims, channel, out_x, out_y, batch)] = 1458 static_cast<uint8>(acc); 1459 } 1460 } 1461 } 1462 } 1463} 1464 1465// legacy, for compatibility with old checked-in code 1466template <FusedActivationFunctionType Ac> 1467void AveragePool(const uint8* input_data, const Dims<4>& input_dims, 1468 int stride_width, int stride_height, int pad_width, 1469 int pad_height, int filter_width, int filter_height, 1470 int32 output_activation_min, int32 output_activation_max, 1471 uint8* output_data, const Dims<4>& output_dims) { 1472 static_assert(Ac == FusedActivationFunctionType::kNone || 1473 Ac == FusedActivationFunctionType::kRelu || 1474 Ac == FusedActivationFunctionType::kRelu6 || 1475 Ac == FusedActivationFunctionType::kRelu1, 1476 ""); 1477 if (Ac == FusedActivationFunctionType::kNone) { 
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride convenience overload: applies the same stride to both the
// width and height dimensions, then forwards to the two-stride overload.
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims, int stride,
                 int pad_width, int pad_height, int filter_width,
                 int filter_height, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
                  filter_width, filter_height, output_activation_min,
                  output_activation_max, output_data, output_dims);
}

// Float L2 pooling: each output element is sqrt(mean of squares) of the input
// elements inside the (possibly clipped) filter window, clamped to
// [output_activation_min, output_activation_max].
inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int filter_width, int filter_height,
                   float output_activation_min, float output_activation_max,
                   float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          float sum_squares = 0.f;
          int filter_count = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              const float val =
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)];
              sum_squares += val * val;
              filter_count++;
            }
          }
          // The mean is taken over the valid (non-padded) elements only, so
          // edge windows are not biased towards zero.
          // NOTE(review): filter_count would be zero if a window lay entirely
          // in padding; assumed impossible for valid pooling geometry --
          // confirm against how the output shape is computed by the caller.
          const float l2pool_result = std::sqrt(sum_squares / filter_count);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(l2pool_result, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives the activation clamp bounds from the template parameter Ac, then
// forwards to the runtime-bounds overload above.
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims,
            int stride_width, int stride_height, int pad_width, int pad_height,
            int filter_width, int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
         pad_height, filter_width, filter_height, output_activation_min,
         output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride convenience overload.
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int filter_width, int filter_height,
            float* output_data, const Dims<4>& output_dims) {
  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
             filter_width, filter_height, output_data, output_dims);
}

// Float max pooling: each output element is the maximum input element inside
// the (possibly clipped) filter window, clamped to
// [output_activation_min, output_activation_max].
inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    float output_activation_min, float output_activation_max,
                    float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          // lowest() (not min()) so that all-negative windows are handled.
          float max = std::numeric_limits<float>::lowest();
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              ActivationFunctionWithMinMax(max, output_activation_min,
                                           output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Derives the activation clamp bounds from the template parameter Ac, then
// forwards to the runtime-bounds overload above.
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, float* output_data,
             const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride convenience overload (same stride for width and height).
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             float* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_data, output_dims);
}

// Quantized uint8 max pooling, with the result clamped to
// [output_activation_min, output_activation_max], a sub-range of [0, 255].
inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    int32 output_activation_min, int32 output_activation_max,
                    uint8* output_data, const Dims<4>& output_dims) {
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_GE(output_activation_min, 0);
  TFLITE_DCHECK_LE(output_activation_max, 255);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        for (int channel = 0; channel < depth; ++channel) {
          const int in_x_origin = (out_x * stride_width) - pad_width;
          const int in_y_origin = (out_y * stride_height) - pad_height;
          // Compute the boundaries of the filter region clamped so as to
          // ensure that the filter window fits in the input array.
          const int filter_x_start = std::max(0, -in_x_origin);
          const int filter_x_end =
              std::min(filter_width, input_width - in_x_origin);
          const int filter_y_start = std::max(0, -in_y_origin);
          const int filter_y_end =
              std::min(filter_height, input_height - in_y_origin);
          // 0 is a valid lower bound since uint8 inputs are >= 0.
          uint8 max = 0;
          for (int filter_y = filter_y_start; filter_y < filter_y_end;
               ++filter_y) {
            for (int filter_x = filter_x_start; filter_x < filter_x_end;
                 ++filter_x) {
              const int in_x = in_x_origin + filter_x;
              const int in_y = in_y_origin + filter_y;
              max = std::max(
                  max,
                  input_data[Offset(input_dims, channel, in_x, in_y, batch)]);
            }
          }
          // Clamp to the fused-activation range.
          max = std::max<uint8>(max, output_activation_min);
          max = std::min<uint8>(max, output_activation_max);
          output_data[Offset(output_dims, channel, out_x, out_y, batch)] =
              static_cast<uint8>(max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
// Forwards to the runtime-bounds overload; when Ac is kNone the bounds are
// asserted to span the full uint8 range.
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width, int pad_height,
             int filter_width, int filter_height, int32 output_activation_min,
             int32 output_activation_max, uint8* output_data,
             const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    // kNone means no clamping beyond the natural uint8 range.
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
// Single-stride convenience overload of the quantized uint8 MaxPool.
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width, int filter_height,
             int32 output_activation_min, int32 output_activation_max,
             uint8* output_data, const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// Local Response Normalization across the depth (channel) dimension:
// output = input * (bias + alpha * sum_of_squares)^(-beta), where the sum of
// squares is taken over a window of channels around c.
inline void LocalResponseNormalization(const float* input_data,
                                       const Dims<4>& input_dims, int range,
                                       float bias, float alpha, float beta,
                                       float* output_data,
                                       const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          // NOTE(review): end_input_c is an exclusive bound, so the window is
          // [c - range, c + range - 1] -- asymmetric around c. TF's LRN sums
          // over the inclusive [c - depth_radius, c + depth_radius]; confirm
          // the caller passes a 'range' value compensated for this.
          const int begin_input_c = std::max(0, c - range);
          const int end_input_c = std::min(depth, c + range);
          float accum = 0.f;
          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
            const float input_val =
                input_data[Offset(input_dims, input_c, x, y, b)];
            accum += input_val * input_val;
          }
          const float multiplier = std::pow(bias + alpha * accum, -beta);
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
        }
      }
    }
  }
}

// Float softmax over the depth (channel) dimension, with temperature
// parameter beta: output[c] = exp(beta*(in[c]-max)) / sum_c exp(beta*(in[c]-max)).
inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                    float beta, float* output_data,
                    const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        // Find max element value which we'll use to ensure numerical stability
        // taking advantage of the following equality:
        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
        float max = std::numeric_limits<float>::lowest();
        for (int c = 0; c < depth; ++c) {
          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // Compute sum.
        float sum = 0.f;
        for (int c = 0; c < depth; ++c) {
          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                          beta);
        }

        // Compute result.
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
                       beta) /
              sum;
        }
      }
    }
  }
}

// Quantized uint8 softmax over the depth dimension, computed entirely in
// gemmlowp fixed-point arithmetic.
inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_beta_multiplier, int32 input_beta_left_shift,
                    int diff_min, uint8* output_data,
                    const Dims<4>& output_dims) {
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large as
  // -32 before multiplying by input_beta_multiplier, and therefore as large as
  // -16 afterwards. Note that exp(-8) is definitely not insignificant to
  // accumulation, but exp(-16) definitely is.
  // Q5.26 for the scaled diffs fed to exp(); Q12.19 for the accumulator.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int x = 0; x < width; ++x) {
      for (int y = 0; y < height; ++y) {
        // Max subtraction for numerical stability, as in the float Softmax.
        uint8 max_in_row = 0;
        for (int c = 0; c < depth; ++c) {
          max_in_row =
              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
        }

        // Accumulate sum(exp(scaled_diff)). Diffs below diff_min contribute
        // negligibly to the sum and are skipped entirely.
        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
            sum_of_exps =
                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                  exp_on_negative_values(scaled_diff_f8));
          }
        }

        // Compute 1/sum_of_exps: normalize the sum into [1, 2) by shifting
        // out the headroom, then use the [0, 1) reciprocal approximation.
        int32 fixed_sum_of_exps = sum_of_exps.raw();
        int headroom_plus_one =
            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
        // This is the number of bits to the left of the binary point above 1.0.
        // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
        // no later adjustment will be needed.
        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
        int32 shifted_sum_minus_one = static_cast<int32>(
            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
            (static_cast<uint32>(1) << 31));

        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
            FixedPoint0::FromRaw(shifted_sum_minus_one));

        for (int c = 0; c < depth; ++c) {
          int32 input_diff =
              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
              max_in_row;
          if (input_diff >= diff_min) {
            const int32 input_diff_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_diff, input_beta_multiplier, input_beta_left_shift);
            const FixedPointScaledDiff scaled_diff_f8 =
                FixedPointScaledDiff::FromRaw(input_diff_rescaled);

            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

            // Saturate to the uint8 output range.
            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));

          } else {
            // Terms skipped in the sum get zero probability.
            output_data[Offset(output_dims, c, x, y, b)] = 0;
          }
        }
      }
    }
  }
}

// Float sigmoid: 1 / (1 + exp(-x)), element-wise.
inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          float result = 1.f / (1.f + std::exp(-val));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}

// Quantized uint8 sigmoid, computed in gemmlowp fixed point. Inputs further
// than input_range_radius from the zero point saturate directly to 0 or 255.
inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_zero_point, int32 input_range_radius,
                     int32 input_multiplier, int input_left_shift,
                     uint8* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
          const int32 input_val_centered =
              static_cast<int32>(input_val_u8) - input_zero_point;
          uint8 output_val;
          if (input_val_centered <= -input_range_radius) {
            // Far negative input: sigmoid saturates to 0.
            output_val = 0;
          } else if (input_val_centered >= input_range_radius) {
            // Far positive input: sigmoid saturates to 1 (255 quantized).
            output_val = 255;
          } else {
            const int32 input_val_rescaled =
                MultiplyByQuantizedMultiplierGreaterThanOne(
                    input_val_centered, input_multiplier, input_left_shift);
            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
            const FixedPoint4 input_val_f4 =
                FixedPoint4::FromRaw(input_val_rescaled);
            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
            using gemmlowp::RoundingDivideByPOT;
            // Rescale the Q0.31 result to [0, 256]; 256 can only arise from
            // rounding and is clamped back to 255.
            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
            if (output_val_s32 == 256) {
              output_val_s32 = 255;
            }
            TFLITE_DCHECK_GE(output_val_s32, 0);
            TFLITE_DCHECK_LE(output_val_s32, 255);
            output_val = static_cast<uint8>(output_val_s32);
          }
          output_data[Offset(output_dims, c, x, y, b)] = output_val;
        }
      }
    }
  }
1939} 1940 1941inline void Tanh(const float* input_data, const Dims<4>& input_dims, 1942 float* output_data, const Dims<4>& output_dims) { 1943 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1944 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 1945 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 1946 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1947 for (int b = 0; b < batches; ++b) { 1948 for (int y = 0; y < height; ++y) { 1949 for (int x = 0; x < width; ++x) { 1950 for (int c = 0; c < depth; ++c) { 1951 float val = input_data[Offset(input_dims, c, x, y, b)]; 1952 float result = std::tanh(val); 1953 output_data[Offset(output_dims, c, x, y, b)] = result; 1954 } 1955 } 1956 } 1957 } 1958} 1959 1960inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, 1961 int32 zero_point, double scale, float* output_data, 1962 const Dims<4>& output_dims) { 1963 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 1964 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 1965 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 1966 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 1967 for (int b = 0; b < batches; ++b) { 1968 for (int y = 0; y < height; ++y) { 1969 for (int x = 0; x < width; ++x) { 1970 for (int c = 0; c < depth; ++c) { 1971 int32 val = input_data[Offset(input_dims, c, x, y, b)]; 1972 float result = static_cast<float>(scale * (val - zero_point)); 1973 output_data[Offset(output_dims, c, x, y, b)] = result; 1974 } 1975 } 1976 } 1977 } 1978} 1979 1980inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, 1981 float rmin, float rmax, float* output_data, 1982 const Dims<4>& output_dims) { 1983 // 0 should always be a representable value. Let's assume that the initial 1984 // min,max range contains 0. 
  TFLITE_DCHECK_LE(rmin, 0.);
  TFLITE_DCHECK_GE(rmax, 0.);

  // Determine quantization parameters: zero_point, scale.
  using Integer = uint8;
  const Integer qmin = std::numeric_limits<Integer>::min();
  const Integer qmax = std::numeric_limits<Integer>::max();
  const float qmin_float = qmin;
  const float qmax_float = qmax;
  int32 zero_point = 0;
  float scale = 0.f;
  // If rmin==rmax, both must be zero per the above assertion,
  // so we are done.
  if (rmin != rmax) {
    // First determine the scale.
    scale = (rmax - rmin) / (qmax_float - qmin_float);

    // Zero-point computation.
    // First the initial floating-point computation. The zero-point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair
    // will be roughly machine_epsilon * (sum of absolute values of terms)
    // so we want to use the variant that adds the smaller terms.
    const float zero_point_from_min = qmin_float - rmin / scale;
    const float zero_point_from_max = qmax_float - rmax / scale;
    const float zero_point_from_min_error =
        std::abs(qmin_float) + std::abs(rmin / scale);
    const float zero_point_from_max_error =
        std::abs(qmax_float) + std::abs(rmax / scale);

    const float zero_point_float =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer
    // (our zero points are integer, and this is motivated by the requirement
    // to be able to represent the real value "0" exactly as a quantized value,
    // which is required in multiple places, for example in Im2col with SAME
    // padding).
    if (zero_point_float < qmin_float) {
      zero_point = qmin;
    } else if (zero_point_float > qmax_float) {
      zero_point = qmax;
    } else {
      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
    }
    // The zero point should always be in the range of quantized value,
    // [qmin, qmax].
    TFLITE_DCHECK_GE(zero_point, qmin);
    TFLITE_DCHECK_LE(zero_point, qmax);
  }

  // Quantize then immediately dequantize each element, so the output carries
  // exactly the error the real quantized inference would introduce.
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
          const float unclamped_quantized_val =
              TfLiteRound(zero_point + src_val / scale);
          const float quantized_val = std::min(
              qmax_float, std::max(qmin_float, unclamped_quantized_val));
          const float dst_val = scale * (quantized_val - zero_point);
          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
        }
      }
    }
  }
}

// Cast: element-wise static_cast from SrcT to DstT.
template <typename SrcT, typename DstT>
inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                 DstT* output_data, const Dims<4>& output_dims) {
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          // NOTE(review): 'offset' is computed from input_dims but also used
          // to index output_data below -- assumes both tensors share the same
          // packed layout (MatchingArraySize only checks sizes, not strides).
          int offset = Offset(input_dims, c, x, y, b);
output_data[offset] = static_cast<DstT>(input_data[offset]); 2074 } 2075 } 2076 } 2077 } 2078} 2079 2080inline void Floor(const float* input_data, const Dims<4>& input_dims, 2081 float* output_data, const Dims<4>& output_dims) { 2082 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2083 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2084 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2085 const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2086 for (int b = 0; b < batches; ++b) { 2087 for (int y = 0; y < height; ++y) { 2088 for (int x = 0; x < width; ++x) { 2089 for (int c = 0; c < depth; ++c) { 2090 int offset = Offset(input_dims, c, x, y, b); 2091 output_data[offset] = std::floor(input_data[offset]); 2092 } 2093 } 2094 } 2095 } 2096} 2097 2098template <typename T> 2099inline void Gather(const T* input_data, const Dims<4>& input_dims, 2100 int input_rank, const int32* coords_data, 2101 const Dims<4>& coords_dims, T* output_data, 2102 const Dims<4>& output_dims) { 2103 TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]); 2104 int stride = input_dims.strides[input_rank - 1]; 2105 T* out = output_data; 2106 2107 for (int i = 0; i < coords_dims.sizes[0]; i++) { 2108 TFLITE_DCHECK_GE(coords_data[i], 0); 2109 TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]); 2110 const T* in = input_data + coords_data[i] * stride; 2111 memcpy(out, in, sizeof(T) * stride); 2112 out += stride; 2113 } 2114} 2115 2116inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, 2117 const int32* output_size_data, 2118 const Dims<4>& output_size_dims, float* output_data, 2119 const Dims<4>& output_dims) { 2120 int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2121 int32 input_height = ArraySize(input_dims, 2); 2122 int32 input_width = ArraySize(input_dims, 1); 2123 int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); 2124 2125 
  // output_size is a 1-D tensor of two elements: [new_height, new_width].
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
  float height_scale = static_cast<float>(input_height) / output_height;
  float width_scale = static_cast<float>(input_width) / output_width;

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      float input_y = y * height_scale;
      // Bilinear interpolation between rows y0 and y1; y1 is clamped so the
      // last row is duplicated at the bottom edge.
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        for (int c = 0; c < depth; ++c) {
          // Weighted sum of the four neighbors, with weights given by the
          // fractional parts (input_y - y0) and (input_x - x0).
          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
                                    (1 - (input_y - y0)) *
                                    (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x0, y1, b)] *
                                    (input_y - y0) * (1 - (input_x - x0)) +
                                input_data[Offset(input_dims, c, x1, y0, b)] *
                                    (1 - (input_y - y0)) * (input_x - x0) +
                                input_data[Offset(input_dims, c, x1, y1, b)] *
                                    (input_y - y0) * (input_x - x0);
          output_data[Offset(output_dims, c, x, y, b)] = interpolation;
        }
      }
    }
  }
}

// SpaceToBatchND: moves spatial blocks of size block_shape into the batch
// dimension, after zero-padding the spatial dimensions per 'paddings'.
template <typename T>
inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims,
                           const int32* paddings_data,
                           const Dims<4>& paddings_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];
  const int padding_top = paddings_data[0];
  const int padding_left = paddings_data[2];

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    // Decompose the output batch index into (input batch, block offsets).
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
        if (out_h * block_shape_height < padding_top ||
            out_h * block_shape_height >= padding_top + input_height ||
            out_w * block_shape_width < padding_left ||
            out_w * block_shape_width >= padding_left + input_width) {
          // This location falls in the padded region: write zeros.
          memset(out, 0, depth * sizeof(T));
        } else {
          const T* in =
              input_data +
              Offset(input_dims, 0,
                     (out_w * block_shape_width + shift_w) - padding_left,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     input_batch);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}

// BatchToSpaceND: inverse of SpaceToBatchND (without cropping) -- moves block
// offsets encoded in the batch dimension back into the spatial dimensions.
template <typename T>
inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims, T* output_data,
                           const Dims<4>& output_dims) {
  const int output_batch_size = ArraySize(output_dims, 3);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_width = block_shape_data[1];
  const int block_shape_height = block_shape_data[0];

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        int out_batch = in_batch % output_batch_size;
        int out_w = in_w * block_shape_width +
                    (in_batch / output_batch_size) % block_shape_width;
        int out_h = in_h * block_shape_height +
                    (in_batch / output_batch_size) / block_shape_width;
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
        memcpy(out, in, depth * sizeof(T));
      }
    }
  }
}

// Zero-pads the input per the left/right padding amounts for each dimension.
// Padding vectors are indexed in storage order: [depth, width, height, batch].
template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int left_b_padding = left_paddings[3];
  const int left_h_padding = left_paddings[2];
  const int left_w_padding = left_paddings[1];
  const int left_d_padding = left_paddings[0];

  const int right_b_padding = right_paddings[3];
  const int right_h_padding = right_paddings[2];
  const int right_w_padding = right_paddings[1];
  const int right_d_padding = right_paddings[0];

  // Both tensors are walked linearly; in_ptr only advances when the output
  // location lies inside the unpadded region.
  const T* in_ptr = input_data;
  T* out_ptr = output_data;
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        for (int out_d = 0; out_d < output_depth; ++out_d) {
          if (out_b < left_b_padding ||
              out_b >= output_batch - right_b_padding ||
              out_h < left_h_padding ||
              out_h >= output_height - right_h_padding ||
              out_w < left_w_padding ||
              out_w >= output_width - right_w_padding ||
              out_d < left_d_padding ||
              out_d >= output_depth - right_d_padding) {
            *out_ptr++ = 0;
          } else {
            *out_ptr++ = *in_ptr++;
          }
        }
      }
    }
  }
}

// StridedSlice over 4-D tensors. Mask bits 0..3 select dims in storage order
// (depth, width, height, batch); a set begin/end mask bit means "use the full
// extent" for that dimension.
// NOTE(review): the loops use '<' with 'in += stride', so this assumes all
// strides are positive -- confirm negative strides are rejected upstream.
template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  const int start_b = (begin_mask & 8) ? 0 : starts[3];
  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
  const int start_h = (begin_mask & 4) ? 0 : starts[2];
  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
  const int start_w = (begin_mask & 2) ? 0 : starts[1];
  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
  const int start_d = (begin_mask & 1) ? 0 : starts[0];
  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
    for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
      for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
        for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
          *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
        }
      }
    }
  }
}

// Slice: copies a contiguous sub-box given by 'begin' and 'size' (both in
// storage order [depth, width, height, batch]); size == -1 means "to the end
// of that dimension".
template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  // TODO(dkalenichenko): This op only supports 4D tensors.
2311 TFLITE_DCHECK_EQ(begin.size(), 4); 2312 TFLITE_DCHECK_EQ(size.size(), 4); 2313 const int start_b = begin[3]; 2314 const int stop_b = 2315 size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3]; 2316 const int start_h = begin[2]; 2317 const int stop_h = 2318 size[2] == -1 ? input_dims.sizes[2] - start_b : start_b + size[2]; 2319 const int start_w = begin[1]; 2320 const int stop_w = 2321 size[1] == -1 ? input_dims.sizes[1] - start_b : start_b + size[1]; 2322 const int start_d = begin[0]; 2323 const int stop_d = 2324 size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0]; 2325 2326 T* out_ptr = output_data; 2327 for (int in_b = start_b; in_b < stop_b; ++in_b) { 2328 for (int in_h = start_h; in_h < stop_h; ++in_h) { 2329 for (int in_w = start_w; in_w < stop_w; ++in_w) { 2330 for (int in_d = start_d; in_d < stop_d; ++in_d) { 2331 *out_ptr++ = input_data[Offset(input_dims, in_d, in_w, in_h, in_b)]; 2332 } 2333 } 2334 } 2335 } 2336} 2337 2338template <typename T> 2339inline void Mean(const T* input_data, const Dims<4>& input_dims, 2340 const std::vector<int>& reduction_indices, T* output_data, 2341 const Dims<4>& output_dims) { 2342 const int output_batch = ArraySize(output_dims, 3); 2343 const int output_height = ArraySize(output_dims, 2); 2344 const int output_width = ArraySize(output_dims, 1); 2345 const int output_depth = ArraySize(output_dims, 0); 2346 2347 const int input_height = ArraySize(input_dims, 2); 2348 const int input_width = ArraySize(input_dims, 1); 2349 2350 // The current implementation only supports simultaneous reduction over 2351 // width and height. 
2352 TFLITE_DCHECK_EQ(reduction_indices.size(), 2); 2353 TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) || 2354 (reduction_indices[0] == 2 && reduction_indices[1] == 1)); 2355 TFLITE_DCHECK_EQ(output_height, 1); 2356 TFLITE_DCHECK_EQ(output_width, 1); 2357 2358 for (int out_b = 0; out_b < output_batch; ++out_b) { 2359 for (int out_d = 0; out_d < output_depth; ++out_d) { 2360 float value = 0; 2361 for (int in_h = 0; in_h < input_height; ++in_h) { 2362 for (int in_w = 0; in_w < input_width; ++in_w) { 2363 value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)]; 2364 } 2365 } 2366 output_data[Offset(output_dims, out_d, 0, 0, out_b)] = 2367 value / (input_width * input_height); 2368 } 2369 } 2370} 2371 2372template <typename T> 2373void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data, 2374 const Dims<4>& input2_dims, T* output_data, 2375 const Dims<4>& output_dims) { 2376 NdArrayDesc<4> desc1; 2377 NdArrayDesc<4> desc2; 2378 NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); 2379 2380 // In Tensorflow, the dimensions are canonically named (batch_number, row, 2381 // col, channel), with extents (batches, height, width, depth), with the 2382 // trailing dimension changing most rapidly (channels has the smallest stride, 2383 // typically 1 element). 2384 // 2385 // In generated C code, we store arrays with the dimensions reversed. The 2386 // first dimension has smallest stride. 2387 // 2388 // We name our variables by their Tensorflow convention, but generate C code 2389 // nesting loops such that the innermost loop has the smallest stride for the 2390 // best cache behavior. 
2391 for (int b = 0; b < ArraySize(output_dims, 3); ++b) { 2392 for (int y = 0; y < ArraySize(output_dims, 2); ++y) { 2393 for (int x = 0; x < ArraySize(output_dims, 1); ++x) { 2394 for (int c = 0; c < ArraySize(output_dims, 0); ++c) { 2395 output_data[Offset(output_dims, c, x, y, b)] = 2396 input1_data[SubscriptToIndex(desc1, c, x, y, b)] - 2397 input2_data[SubscriptToIndex(desc2, c, x, y, b)]; 2398 } 2399 } 2400 } 2401 } 2402} 2403 2404template <typename T> 2405void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims, 2406 const T* input2_data, T* output_data, 2407 const Dims<4>& output_dims) { 2408 int batches = MatchingArraySize(input1_dims, 3, output_dims, 3); 2409 int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2); 2410 int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1); 2411 int depth = MatchingArraySize(input1_dims, 0, output_dims, 0); 2412 2413 auto min_value = input2_data[0]; 2414 2415 for (int b = 0; b < batches; b++) { 2416 for (int y = 0; y < input_height; y++) { 2417 for (int x = 0; x < input_width; x++) { 2418 for (int c = 0; c < depth; c++) { 2419 int offset = Offset(input1_dims, c, x, y, b); 2420 output_data[offset] = 2421 input1_data[offset] > min_value ? 
min_value : input1_data[offset]; 2422 } 2423 } 2424 } 2425 } 2426} 2427 2428template <typename T> 2429void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims, 2430 const T* input2_data, T* output_data, 2431 const Dims<4>& output_dims) { 2432 int batches = MatchingArraySize(input1_dims, 3, output_dims, 3); 2433 int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2); 2434 int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1); 2435 int depth = MatchingArraySize(input1_dims, 0, output_dims, 0); 2436 2437 auto max_value = input2_data[0]; 2438 2439 for (int b = 0; b < batches; b++) { 2440 for (int y = 0; y < input_height; y++) { 2441 for (int x = 0; x < input_width; x++) { 2442 for (int c = 0; c < depth; c++) { 2443 int offset = Offset(input1_dims, c, x, y, b); 2444 output_data[offset] = 2445 input1_data[offset] < max_value ? max_value : input1_data[offset]; 2446 } 2447 } 2448 } 2449 } 2450} 2451 2452template <typename T1, typename T2, typename T3> 2453void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims, 2454 T2* output_data, const Dims<4>& output_dims) { 2455 // The current ArgMax implemention can only determine the index of the maximum 2456 // value in the last dimension. So the axis argument is ignored. 2457 TFLITE_DCHECK_EQ(axis[0], 3); 2458 2459 // For ArgMax, the number of output dimensions = (number of input dimensions - 2460 // 1). For the sake of simplicity, the output dimensions are equal to the 2461 // input dimensions here. We enforce the constraint that the last dimension 2462 // must always be 1. 
2463 TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1); 2464 const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 2465 const int height = MatchingArraySize(input_dims, 2, output_dims, 2); 2466 const int width = MatchingArraySize(input_dims, 1, output_dims, 1); 2467 const int depth = ArraySize(input_dims, 0); 2468 for (int b = 0; b < batches; ++b) { 2469 for (int y = 0; y < height; ++y) { 2470 for (int x = 0; x < width; ++x) { 2471 auto max_value = input_data[Offset(input_dims, 0, x, y, b)]; 2472 int max_index = 0; 2473 for (int d = 1; d < depth; ++d) { 2474 const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)]; 2475 if (curr_value > max_value) { 2476 max_value = curr_value; 2477 max_index = d; 2478 } 2479 } 2480 output_data[Offset(output_dims, 0, x, y, b)] = max_index; 2481 } 2482 } 2483 } 2484} 2485 2486} // namespace reference_ops 2487} // namespace tflite 2488 2489#endif // THIRD_PARTY_TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_REFERENCE_REFERENCE_OPS_H_ 2490