/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_
#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_

#include <assert.h>
#include <stdint.h>
#include <string.h>  // for memcpy/memset used below
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <memory>
#include <tuple>
#include <type_traits>

#include "Eigen/Core"
#include "unsupported/Eigen/CXX11/Tensor"
#include "fixedpoint/fixedpoint.h"
#include "public/gemmlowp.h"
#include "tensorflow/contrib/lite/kernels/internal/common.h"
#include "tensorflow/contrib/lite/kernels/internal/round.h"
#include "tensorflow/contrib/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_ops {

// Make a local VectorMap typedef allowing us to map a float array
// as an Eigen vector expression. The std::conditional here is to
// construct the suitable Eigen type for the constness of the
// data. Indeed, for const data, we need to produce
//    Eigen::Map<const Eigen::Matrix<float, ...>>
// and not the more straightforward
//    Eigen::Map<Eigen::Matrix<const float, ...>>
template <typename Scalar>
using VectorMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, 1>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;

template <typename Scalar, int N>
VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) {
  const int size = RequiredBufferSizeForDims(dims);
  return VectorMap<Scalar>(data, size, 1);
}
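
// Illustrative usage sketch (not part of the library): mapping a raw buffer
// through MapAsVector lets a kernel use Eigen's vectorized reductions and
// coefficient-wise operations directly. The helper below is hypothetical,
// added only to show the intended calling pattern.
template <int N>
float ExampleSumAsVector(const float* data, const Dims<N>& dims) {
  // Scalar deduces as 'const float', so the std::conditional above selects
  // the const-qualified Eigen::Map specialization.
  const auto map = MapAsVector(data, dims);
  return map.sum();  // Eigen reduction over all mapped elements.
}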

// Make a local MatrixMap typedef allowing us to map a float array
// as an Eigen matrix expression. The same explanation as for VectorMap
// above also applies here.
template <typename Scalar>
using MatrixMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type,
                                   Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data,
                                                const Dims<N>& dims) {
  const int rows = dims.sizes[0];
  int cols = 1;
  for (int d = 1; d < N; d++) {
    cols *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data,
                                               const Dims<N>& dims) {
  const int cols = dims.sizes[N - 1];
  int rows = 1;
  for (int d = 0; d < N - 1; d++) {
    rows *= dims.sizes[d];
  }
  return MatrixMap<Scalar>(data, rows, cols);
}

template <typename Scalar>
using ArrayMap = typename std::conditional<
    std::is_const<Scalar>::value,
    Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type,
                                  Eigen::Dynamic, Eigen::Dynamic>>,
    Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;

template <typename Scalar, int N>
ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data,
                                              const Dims<N>& dims) {
  const int rows = dims.sizes[0];
  int cols = 1;
  for (int d = 1; d < N; d++) {
    cols *= dims.sizes[d];
  }
  return ArrayMap<Scalar>(data, rows, cols);
}

// TODO(b/62193649): this function is only needed as long
// as we have the --variable_batch hack.
template <typename Scalar, int N>
MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data,
                                                   const Dims<N>& dims,
                                                   int rows) {
  int cols = 1;
  bool matched_rows = false;
  for (int d = 0; d < N; d++) {
    cols *= dims.sizes[d];
    if (cols == rows) {
      matched_rows = true;
      cols = 1;
    }
  }
  TFLITE_DCHECK(matched_rows);
  return MatrixMap<Scalar>(data, rows, cols);
}

// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
// BROADCASTING.
//
// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
// rectangular array of numbers.
//
// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
// However, as Dims<N> is to be deprecated, this class exists as an adaptor
// to enable simple unoptimized implementations of element-wise broadcasting
// operations.
template <int N>
struct NdArrayDesc {
  // The "extent" of each dimension. Indices along dimension d must be in the
  // half-open interval [0, extents[d]).
  int extents[N];

  // The number of *elements* (not bytes) between consecutive indices of each
  // dimension.
  int strides[N];
};

// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
// ELEMENT-WISE BROADCASTING.
//
// Same as Offset(), except takes an NdArrayDesc<N> instead of Dims<N>.
inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
                            int i3) {
  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
         i3 * desc.strides[3];
}

// Given the dimensions of the operands for an element-wise binary broadcast,
// adjusts them so that they can be directly iterated over with simple loops.
// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
//
// This function assumes that the two input shapes are compatible up to
// broadcasting and the shorter one has already been prepended with 1s to be the
// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
// Dims<N> refers to shapes in reverse order. In this case, input0_dims will be
// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
//
// When two shapes are compatible up to broadcasting, for each dimension d,
// the input extents are either equal, or one of them is 1.
//
// This function performs the following for each dimension d:
// - If the extents are equal, then do nothing since the loop that walks over
//   both of the input arrays is correct.
// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
//   array0 to be referenced *at any index* in dimension d and still access the
//   same slice.
template <int N>
inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
                                                const Dims<N>& input1_dims,
                                                NdArrayDesc<N>* desc0_out,
                                                NdArrayDesc<N>* desc1_out) {
  TFLITE_DCHECK(desc0_out != nullptr);
  TFLITE_DCHECK(desc1_out != nullptr);

  // Copy dims to desc.
  for (int i = 0; i < N; ++i) {
    desc0_out->extents[i] = input0_dims.sizes[i];
    desc0_out->strides[i] = input0_dims.strides[i];
    desc1_out->extents[i] = input1_dims.sizes[i];
    desc1_out->strides[i] = input1_dims.strides[i];
  }

  // Walk over each dimension. If the extents are equal, do nothing.
  // Otherwise, set the desc with extent 1 to have extent equal to the other
  // and stride 0.
  for (int i = 0; i < N; ++i) {
    const int extent0 = ArraySize(input0_dims, i);
    const int extent1 = ArraySize(input1_dims, i);
    if (extent0 != extent1) {
      if (extent0 == 1) {
        desc0_out->strides[i] = 0;
        desc0_out->extents[i] = extent1;
      } else {
        TFLITE_DCHECK_EQ(extent1, 1);
        desc1_out->strides[i] = 0;
        desc1_out->extents[i] = extent0;
      }
    }
  }
}

inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
  for (int i = 0; i < 4; i++) {
    if (dims1.sizes[i] != dims2.sizes[i]) {
      return false;
    }
  }
  return true;
}

inline void AddBiasAndEvalActivationFunction(const float* bias_data,
                                             const Dims<4>& bias_dims,
                                             float* array_data,
                                             const Dims<4>& array_dims,
                                             float output_activation_min,
                                             float output_activation_max) {
#ifdef USE_NEON
  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
  float* array_ptr = array_data;
  float* array_end_ptr = array_ptr + array_size;
  const auto activation_min = vdupq_n_f32(output_activation_min);
  const auto activation_max = vdupq_n_f32(output_activation_max);
  for (; array_ptr != array_end_ptr; array_ptr += bias_size) {
    int i = 0;
    for (; i <= bias_size - 16; i += 16) {
      auto b0 = vld1q_f32(bias_data + i);
      auto b1 = vld1q_f32(bias_data + i + 4);
      auto b2 = vld1q_f32(bias_data + i + 8);
      auto b3 = vld1q_f32(bias_data + i + 12);
      auto a0 = vld1q_f32(array_ptr + i);
      auto a1 = vld1q_f32(array_ptr + i + 4);
      auto a2 = vld1q_f32(array_ptr + i + 8);
      auto a3 = vld1q_f32(array_ptr + i + 12);
      auto x0 = vaddq_f32(a0, b0);
      auto x1 = vaddq_f32(a1, b1);
      auto x2 = vaddq_f32(a2, b2);
      auto x3 = vaddq_f32(a3, b3);
      x0 = vmaxq_f32(activation_min, x0);
      x1 = vmaxq_f32(activation_min, x1);
      x2 = vmaxq_f32(activation_min, x2);
      x3 = vmaxq_f32(activation_min, x3);
      x0 = vminq_f32(activation_max, x0);
      x1 = vminq_f32(activation_max, x1);
      x2 = vminq_f32(activation_max, x2);
      x3 = vminq_f32(activation_max, x3);
      vst1q_f32(array_ptr + i, x0);
      vst1q_f32(array_ptr + i + 4, x1);
      vst1q_f32(array_ptr + i + 8, x2);
      vst1q_f32(array_ptr + i + 12, x3);
    }
    for (; i <= bias_size - 4; i += 4) {
      auto b = vld1q_f32(bias_data + i);
      auto a = vld1q_f32(array_ptr + i);
      auto x = vaddq_f32(a, b);
      x = vmaxq_f32(activation_min, x);
      x = vminq_f32(activation_max, x);
      vst1q_f32(array_ptr + i, x);
    }
    for (; i < bias_size; i++) {
      array_ptr[i] = ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i],
                                                  output_activation_min,
                                                  output_activation_max);
    }
  }
#else  // not NEON
  gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction");
  const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3];
  const int array_size = array_dims.sizes[3] * array_dims.strides[3];
  TFLITE_DCHECK_EQ((array_size % bias_size), 0);
  for (int array_offset = 0; array_offset < array_size;
       array_offset += bias_size) {
    for (int i = 0; i < bias_size; i++) {
      array_data[array_offset + i] = ActivationFunctionWithMinMax(
          array_data[array_offset + i] + bias_data[i], output_activation_min,
          output_activation_max);
    }
  }
#endif
}
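
// Illustrative sketch (not part of the library): how the broadcast
// descriptors built by NdArrayDescsForElementwiseBroadcast above are
// typically consumed. A simple element-wise broadcast binary op walks the
// output shape with plain loops and indexes both operands through
// SubscriptToIndex(); the zeroed strides make the smaller operand repeat.
// The function name and the choice of std::max as the op are hypothetical.
inline void ExampleBroadcastMax(const float* input1_data,
                                const Dims<4>& input1_dims,
                                const float* input2_data,
                                const Dims<4>& input2_dims, float* output_data,
                                const Dims<4>& output_dims) {
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1,
                                      &desc2);
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          // Out-of-extent subscripts on the broadcast operand are legal
          // because its stride in that dimension has been set to 0.
          output_data[Offset(output_dims, c, x, y, b)] =
              std::max(input1_data[SubscriptToIndex(desc1, c, x, y, b)],
                       input2_data[SubscriptToIndex(desc2, c, x, y, b)]);
        }
      }
    }
  }
}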

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AddBiasAndEvalActivationFunction(const float* bias_data,
                                      const Dims<4>& bias_dims,
                                      float* array_data,
                                      const Dims<4>& array_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  AddBiasAndEvalActivationFunction(bias_data, bias_dims, array_data,
                                   array_dims, output_activation_min,
                                   output_activation_max);
}

template <typename Lhs, typename Rhs, typename Result>
void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
          Eigen::MatrixBase<Result>* result) {
  if (rhs.cols() == 1) {
    gemmlowp::ScopedProfilingLabel label("GEMV");
    result->col(0).noalias() = lhs * rhs.col(0);
  } else {
    gemmlowp::ScopedProfilingLabel label("GEMM");
    result->noalias() = lhs * rhs;
  }
}

inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                           const float* weights_data,
                           const Dims<4>& weights_dims, const float* bias_data,
                           const Dims<4>& bias_dims,
                           float output_activation_min,
                           float output_activation_max, float* output_data,
                           const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("FullyConnected");
  // TODO(b/62193649): this convoluted shape computation (determining
  // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows)
  // is because the current --variable_batch hack consists in overwriting the
  // 3rd dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  // When that is fixed, this should become:
  // const auto input_matrix_map =
  //     MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  const int input_rows = ArraySize(weights_dims, 0);
  const auto input_matrix_map =
      MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows);
  const auto filter_matrix_map =
      MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims);
  auto output_matrix_map =
      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);

  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);
  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
                                   output_dims, output_activation_min,
                                   output_activation_max);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                    const float* weights_data, const Dims<4>& weights_dims,
                    const float* bias_data, const Dims<4>& bias_dims,
                    float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  FullyConnected(input_data, input_dims, weights_data, weights_dims, bias_data,
                 bias_dims, output_activation_min, output_activation_max,
                 output_data, output_dims);
}
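
// Illustrative sketch (not part of the library): Dims<4> as used throughout
// this header stores sizes and element strides from the innermost dimension
// outwards. The packed layout that the kernels above assume (see
// IsPackedWithoutStrides) can be built as below; the helper name is
// hypothetical, added only as an example.
inline Dims<4> ExampleMakePackedDims(int d0, int d1, int d2, int d3) {
  Dims<4> dims;
  dims.sizes[0] = d0;  // innermost, e.g. depth
  dims.sizes[1] = d1;
  dims.sizes[2] = d2;
  dims.sizes[3] = d3;  // outermost, e.g. batch
  dims.strides[0] = 1;
  for (int i = 1; i < 4; i++) {
    dims.strides[i] = dims.strides[i - 1] * dims.sizes[i - 1];
  }
  return dims;
}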

inline void preload_l1_stream(const uint8* ptr) {
#ifdef GEMMLOWP_ARM_64
  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#else
  gemmlowp::Prefetch(ptr);
#endif
}

#ifdef USE_NEON
inline void FullyConnectedAsGEMV(
    const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
    const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
    const int32* bias_data, const Dims<4>& bias_dims, int32 output_offset,
    int32 output_multiplier, int output_shift, int32 output_activation_min,
    int32 output_activation_max, uint8* output_data,
    const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit");
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                       ArraySize(output_dims, 3),
                   1);
  const int input_size = input_dims.strides[3];
  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
  static constexpr int kPeel = 4;
  for (int k = 0; k < input_size; k += 64) {
    preload_l1_stream(input_data + k);
  }
  for (int k = 0; k < kPeel * input_size; k += 64) {
    preload_l1_stream(filter_data + k);
  }
  TFLITE_DCHECK(!(output_size % kPeel));
  const int32* bias_ptr = bias_data;
  uint8* output_ptr = output_data;
  for (int out = 0; out < output_size; out += kPeel) {
    int32x4_t acc[kPeel];
    for (int k = 0; k < kPeel; k++) {
      acc[k] = vdupq_n_s32(0);
    }
    const int16x8_t input_offset_vec = vdupq_n_s16(input_offset);
    const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset);
    int in = 0;
    for (; in <= input_size - 16; in += 16) {
      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
      uint8x16_t filter_val_u8[kPeel];
      for (int k = 0; k < kPeel; k++) {
        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
        filter_val_u8[k] = vld1q_u8(filter_ptr);
        preload_l1_stream(filter_ptr + 64);
      }
      int16x8_t input_val[2];
      const uint8x8_t low = vget_low_u8(input_val_u8);
      const uint8x8_t high = vget_high_u8(input_val_u8);
      input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low));
      input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high));
      input_val[0] = vaddq_s16(input_val[0], input_offset_vec);
      input_val[1] = vaddq_s16(input_val[1], input_offset_vec);
      int16x8_t filter_val[kPeel][2];
      for (int k = 0; k < kPeel; k++) {
        const uint8x8_t low = vget_low_u8(filter_val_u8[k]);
        const uint8x8_t high = vget_high_u8(filter_val_u8[k]);
        filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low));
        filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high));
        filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec);
        filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec);
      }
      for (int p = 0; p < 2; p++) {
        for (int k = 0; k < kPeel; k++) {
          acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]),
                             vget_low_s16(input_val[p]));
        }
        for (int k = 0; k < kPeel; k++) {
          acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]),
                             vget_high_s16(input_val[p]));
        }
      }
    }
    for (; in <= input_size - 8; in += 8) {
      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
      uint8x8_t filter_val_u8[kPeel];
      for (int k = 0; k < kPeel; k++) {
        const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
        filter_val_u8[k] = vld1_u8(filter_ptr);
      }
      int16x8_t input_val;
      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
      input_val = vaddq_s16(input_val, input_offset_vec);
      int16x8_t filter_val[kPeel];
      for (int k = 0; k < kPeel; k++) {
        filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k]));
        filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec);
      }
      for (int k = 0; k < kPeel; k++) {
        acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]),
                           vget_low_s16(input_val));
      }
      for (int k = 0; k < kPeel; k++) {
        acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]),
                           vget_high_s16(input_val));
      }
    }
    if (in < input_size) {
      int32 buf[4 * kPeel];
      for (int k = 0; k < 4; k++) {
        vst1q_s32(buf + 4 * k, acc[k]);
      }
      for (; in < input_size; in++) {
        int lane = (in + 8 - input_size) % 4;
        const int32 input_val = input_data[in] + input_offset;
        for (int k = 0; k < kPeel; k++) {
          int32 filter_val =
              filter_data[in + (out + k) * input_size] + filter_offset;
          buf[lane + 4 * k] += filter_val * input_val;
        }
      }
      for (int k = 0; k < 4; k++) {
        acc[k] = vld1q_s32(buf + 4 * k);
      }
    }

    // Horizontally reduce accumulators
    int32x2_t pairwise_reduced_acc[kPeel];
    for (int k = 0; k < kPeel; k++) {
      pairwise_reduced_acc[k] =
          vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k]));
    }
    static_assert(kPeel == 4, "the code below currently assumes kPeel = 4");
    const int32x2_t reduced_lo =
        vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]);
    const int32x2_t reduced_hi =
        vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]);
    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
    // Add bias values.
    int32x4_t bias_vec = vld1q_s32(bias_ptr);
    bias_ptr += 4;
    reduced = vaddq_s32(reduced, bias_vec);
    // Multiply by the fixed-point multiplier.
    reduced = vqrdmulhq_n_s32(reduced, output_multiplier);
    // Rounding-shift-right.
    using gemmlowp::RoundingDivideByPOT;
    reduced = RoundingDivideByPOT(reduced, output_shift);
    // Add the output offset.
    const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
    reduced = vaddq_s32(reduced, output_offset_vec);
    // Narrow values down to 16 bit signed.
    const int16x4_t res16 = vqmovn_s32(reduced);
    // Narrow values down to 8 bit unsigned, saturating.
    uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16));
    // Apply the clamping from the activation function.
    res8 = vmax_u8(res8, vdup_n_u8(output_activation_min));
    res8 = vmin_u8(res8, vdup_n_u8(output_activation_max));
    // Store results to destination. Assumes 32bit alignment.
    vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr),
                  vreinterpret_u32_u8(res8), 0);
    output_ptr += kPeel;
  }
}
#endif  // USE_NEON

struct GemmlowpOutputPipeline {
  typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
      ColVectorMap;
  typedef std::tuple<
      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
      gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
      Pipeline;
  static Pipeline Make(const int32* bias_data, int output_rows,
                       int32 output_offset, int32 output_multiplier,
                       int output_shift, int32 output_activation_min,
                       int32 output_activation_max) {
    ColVectorMap bias_vector(bias_data, output_rows);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
        quantize_down_stage;
    quantize_down_stage.result_offset_after_shift = output_offset;
    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
    quantize_down_stage.result_shift = output_shift;
    gemmlowp::OutputStageClamp clamp_stage;
    clamp_stage.min = output_activation_min;
    clamp_stage.max = output_activation_max;
    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
    return std::make_tuple(bias_addition_stage, quantize_down_stage,
                           clamp_stage, saturating_cast_stage);
  }
};
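
// Illustrative sketch (not part of the library): the per-element arithmetic
// performed by the gemmlowp output pipeline built above, written out in
// scalar form. It mirrors the NEON epilogue of FullyConnectedAsGEMV: bias
// addition, saturating fixed-point multiplication, rounding right shift,
// output offset, and clamping. The function name is hypothetical.
inline uint8 ExampleQuantizeDownScalar(int32 acc, int32 bias,
                                       int32 output_multiplier,
                                       int output_shift, int32 output_offset,
                                       int32 output_activation_min,
                                       int32 output_activation_max) {
  using gemmlowp::RoundingDivideByPOT;
  using gemmlowp::SaturatingRoundingDoublingHighMul;
  acc += bias;
  // Scalar analogue of vqrdmulhq_n_s32 followed by RoundingDivideByPOT.
  acc = SaturatingRoundingDoublingHighMul(acc, output_multiplier);
  acc = RoundingDivideByPOT(acc, output_shift);
  acc += output_offset;
  acc = std::max(acc, output_activation_min);
  acc = std::min(acc, output_activation_max);
  return static_cast<uint8>(acc);
}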

inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                           int32 input_offset, const uint8* filter_data,
                           const Dims<4>& filter_dims, int32 filter_offset,
                           const int32* bias_data, const Dims<4>& bias_dims,
                           int32 output_offset, int32 output_multiplier,
                           int output_shift, int32 output_activation_min,
                           int32 output_activation_max, uint8* output_data,
                           const Dims<4>& output_dims,
                           gemmlowp::GemmContext* gemm_context) {
  gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit");
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                      ArraySize(output_dims, 3);
#ifdef USE_NEON
  const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
  if (batches == 1 && !(output_size % 4)) {
    return FullyConnectedAsGEMV(
        input_data, input_dims, input_offset, filter_data, filter_dims,
        filter_offset, bias_data, bias_dims, output_offset, output_multiplier,
        output_shift, output_activation_min, output_activation_max,
        output_data, output_dims);
  }
#endif  // USE_NEON
  const int filter_rows = filter_dims.sizes[1];
  const int filter_cols = filter_dims.sizes[0];
  TFLITE_DCHECK_EQ(filter_dims.sizes[2], 1);
  TFLITE_DCHECK_EQ(filter_dims.sizes[3], 1);
  const int output_rows = output_dims.sizes[0];
  TFLITE_DCHECK_EQ(output_rows, filter_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);

  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
      filter_data, output_rows, filter_cols, filter_cols);
  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
      input_data, filter_cols, batches, filter_cols);
  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
      output_data, output_rows, batches, output_rows);
  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
      bias_data, output_rows, output_offset, output_multiplier, output_shift,
      output_activation_min, output_activation_max);
  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
      input_offset, output_pipeline);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_offset, const uint8* filter_data,
                    const Dims<4>& filter_dims, int32 filter_offset,
                    const int32* bias_data, const Dims<4>& bias_dims,
                    int32 output_offset, int32 output_multiplier,
                    int output_shift, int32 output_activation_min,
                    int32 output_activation_max, uint8* output_data,
                    const Dims<4>& output_dims,
                    gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  FullyConnected(input_data, input_dims, input_offset, filter_data,
                 filter_dims, filter_offset, bias_data, bias_dims,
                 output_offset, output_multiplier, output_shift,
                 output_activation_min, output_activation_max, output_data,
                 output_dims, gemm_context);
}

template <typename T>
inline void ExtractPatchIntoBufferColumn(
    const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth,
    int stride_width, int stride_height, int pad_width, int pad_height,
    int in_width, int in_height, int in_depth, int single_buffer_length,
    int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) {
  gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn");
  // This chunk of code reshapes all the inputs corresponding to
  // output (b, h, w) to a column vector in conv_buffer(:, buffer_id).
  const int kwidth_times_indepth = kwidth * in_depth;
  const int inwidth_times_indepth = in_width * in_depth;
  const int ih_ungated_start = h * stride_height - pad_height;
  const int ih_ungated_end = (ih_ungated_start + kheight);
  const int ih_end = std::min(ih_ungated_end, in_height);
  const int iw_ungated_start = w * stride_width - pad_width;
  const int iw_ungated_end = (iw_ungated_start + kwidth);
  const int iw_end = std::min(iw_ungated_end, in_width);
  // If the patch is off the edge of the input image, skip writing those rows
  // and columns from the patch into the output array.
  const int h_offset = std::max(0, -ih_ungated_start);
  const int w_offset = std::max(0, -iw_ungated_start);
  const int ih_start = std::max(0, ih_ungated_start);
  const int iw_start = std::max(0, iw_ungated_start);
  const int single_row_num =
      std::min(kwidth - w_offset, in_width - iw_start) * in_depth;
  const int output_row_offset = (buffer_id * single_buffer_length);
  int out_offset =
      output_row_offset + (h_offset * kwidth + w_offset) * in_depth;
  int in_offset = Offset(input_dims, 0, iw_start, ih_start, b);

  // Express all of the calculations as padding around the input patch.
  const int top_padding = h_offset;
  const int bottom_padding = (ih_ungated_end - ih_end);
  const int left_padding = w_offset;
  const int right_padding = (iw_ungated_end - iw_end);
  assert(single_row_num ==
         ((kwidth - (left_padding + right_padding)) * in_depth));

  // Write out zeroes to the elements representing the top rows of the input
  // patch that are off the edge of the input image.
  if (top_padding > 0) {
    const int top_row_elements = (top_padding * kwidth * in_depth);
    memset(conv_buffer_data + output_row_offset, byte_zero,
           (top_row_elements * sizeof(T)));
  }

  // If the patch is on the interior of the input image horizontally, just copy
  // over the rows sequentially, otherwise add zero padding at the start or
  // end.
  if ((left_padding == 0) && (right_padding == 0)) {
    for (int ih = ih_start; ih < ih_end; ++ih) {
      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
             single_row_num * sizeof(T));
      out_offset += kwidth_times_indepth;
      in_offset += inwidth_times_indepth;
    }
  } else {
    for (int ih = ih_start; ih < ih_end; ++ih) {
      if (left_padding > 0) {
        const int left_start = (out_offset - (left_padding * in_depth));
        memset(conv_buffer_data + left_start, byte_zero,
               (left_padding * in_depth * sizeof(T)));
      }
      memcpy(conv_buffer_data + out_offset, in_data + in_offset,
             single_row_num * sizeof(T));
      if (right_padding > 0) {
        const int right_start = (out_offset + single_row_num);
        memset(conv_buffer_data + right_start, byte_zero,
               (right_padding * in_depth * sizeof(T)));
      }
      out_offset += kwidth_times_indepth;
      in_offset += inwidth_times_indepth;
    }
  }

  // If the bottom of the patch falls off the input image, pad the values
  // representing those input rows with zeroes.
  if (bottom_padding > 0) {
    const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
    const int bottom_start =
        output_row_offset +
        ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
    memset(conv_buffer_data + bottom_start, byte_zero,
           (bottom_row_elements * sizeof(T)));
  }
}

template <typename T>
void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
            int stride_height, int pad_width, int pad_height, int kheight,
            int kwidth, uint8 byte_zero, T* output_data,
            const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Im2col");
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);
  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);

  int buffer_id = 0;
  // Loop over the output nodes.
  for (int b = 0; b < batches; ++b) {
    for (int h = 0; h < output_height; ++h) {
      for (int w = 0; w < output_width; ++w) {
        ExtractPatchIntoBufferColumn(
            input_dims, w, h, b, kheight, kwidth, stride_width, stride_height,
            pad_width, pad_height, input_width, input_height, input_depth,
            output_depth, buffer_id, input_data, output_data, byte_zero);
        ++buffer_id;
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <typename T>
void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int kheight, int kwidth,
            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height,
         kheight, kwidth, byte_zero, output_data, output_dims);
}
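
// Illustrative sketch (not part of the library): for Im2col above, each
// output "pixel" (b, h, w) becomes one column of the im2col buffer holding an
// entire kheight x kwidth receptive field, so the buffer's innermost
// dimension must be kheight * kwidth * input_depth. The hypothetical helper
// below just makes that shape arithmetic explicit, in the depth-first size
// order used by Dims<4>.
inline void ExampleIm2colSizes(int kheight, int kwidth, int input_depth,
                               int output_height, int output_width,
                               int batches, int sizes_out[4]) {
  sizes_out[0] = kheight * kwidth * input_depth;  // one flattened patch
  sizes_out[1] = output_width;
  sizes_out[2] = output_height;
  sizes_out[3] = batches;
}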

inline void Conv(const float* input_data, const Dims<4>& input_dims,
                 const float* filter_data, const Dims<4>& filter_dims,
                 const float* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, float output_activation_min,
                 float output_activation_max, float* output_data,
                 const Dims<4>& output_dims, float* im2col_data,
                 const Dims<4>& im2col_dims) {
  (void)im2col_data;
  (void)im2col_dims;
  gemmlowp::ScopedProfilingLabel label("Conv");

  const float* gemm_input_data = nullptr;
  const Dims<4>* gemm_input_dims = nullptr;
  const int filter_width = ArraySize(filter_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                           filter_width != 1 || filter_height != 1;
  if (need_im2col) {
    TFLITE_DCHECK(im2col_data);
    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
           pad_height, filter_height, filter_width, 0, im2col_data,
           im2col_dims);
    gemm_input_data = im2col_data;
    gemm_input_dims = &im2col_dims;
  } else {
    // TODO(aselle): We need to make sure to not send im2col if it is not
    // needed.
    TFLITE_DCHECK(!im2col_data);
    gemm_input_data = input_data;
    gemm_input_dims = &input_dims;
  }

  const auto im2col_matrix_map =
      MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims);
  const auto filter_matrix_map =
      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
  auto output_matrix_map =
      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);

  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);

  AddBiasAndEvalActivationFunction(bias_data, bias_dims, output_data,
                                   output_dims, output_activation_min,
                                   output_activation_max);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride_width,
          int stride_height, int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  Conv(input_data, input_dims, filter_data, filter_dims, bias_data, bias_dims,
       stride_width, stride_height, pad_width, pad_height,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const float* input_data, const Dims<4>& input_dims,
          const float* filter_data, const Dims<4>& filter_dims,
          const float* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, float* output_data,
          const Dims<4>& output_dims, float* im2col_data,
          const Dims<4>& im2col_dims) {
  Conv<Ac>(input_data, input_dims, filter_data, filter_dims, bias_data,
           bias_dims, stride, stride, pad_width, pad_height, output_data,
           output_dims, im2col_data, im2col_dims);
}
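
// Illustrative sketch (not part of the library): the standard relation
// between input size, kernel size, stride, and padding that the Conv
// variants above rely on for one spatial dimension,
//     out = (in + 2 * pad - k) / stride + 1.
// The helper is hypothetical, added only to make the shape arithmetic
// explicit; actual padding values are computed by the framework elsewhere.
inline int ExampleConvOutputSize(int in_size, int k_size, int stride,
                                 int pad) {
  return (in_size + 2 * pad - k_size) / stride + 1;
}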

inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  gemmlowp::ScopedProfilingLabel label("Conv/8bit");

  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));

  const uint8* gemm_input_data = nullptr;
  const Dims<4>* gemm_input_dims = nullptr;
  const int filter_width = ArraySize(filter_dims, 1);
  const int filter_height = ArraySize(filter_dims, 2);
  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                           filter_width != 1 || filter_height != 1;
  if (need_im2col) {
    TFLITE_DCHECK(im2col_data);
    const int input_zero_point = -input_offset;
    TFLITE_DCHECK_GE(input_zero_point, 0);
    TFLITE_DCHECK_LE(input_zero_point, 255);
    Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
           pad_height, filter_height, filter_width, input_zero_point,
           im2col_data, im2col_dims);
    gemm_input_data = im2col_data;
    gemm_input_dims = &im2col_dims;
  } else {
    TFLITE_DCHECK(!im2col_data);
    gemm_input_data = input_data;
    gemm_input_dims = &input_dims;
  }

  const int gemm_input_rows = gemm_input_dims->sizes[0];
  const int gemm_input_cols = gemm_input_dims->sizes[1] *
                              gemm_input_dims->sizes[2] *
                              gemm_input_dims->sizes[3];
  const int filter_rows = filter_dims.sizes[3];
  const int filter_cols =
      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
  const int output_rows = output_dims.sizes[0];
  const int output_cols =
      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
  TFLITE_DCHECK_EQ(output_rows, filter_rows);
  TFLITE_DCHECK_EQ(output_cols, gemm_input_cols);
  TFLITE_DCHECK_EQ(filter_cols, gemm_input_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
      filter_data, filter_rows, filter_cols);
  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
      gemm_input_data, gemm_input_rows, gemm_input_cols);
  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
      output_data, output_rows, output_cols);
  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
      bias_data, output_rows, output_offset, output_multiplier, output_shift,
      output_activation_min, output_activation_max);
  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
      input_offset, output_pipeline);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_offset, const uint8* filter_data,
                 const Dims<4>& filter_dims, int32 filter_offset,
                 const int32* bias_data, const Dims<4>& bias_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int32 output_offset, int32 output_multiplier,
                 int output_shift, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims, uint8* im2col_data,
                 const Dims<4>& im2col_dims,
                 gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride_width, stride_height,
       pad_width, pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Conv(const uint8* input_data, const Dims<4>& input_dims,
          int32 input_offset, const uint8* filter_data,
          const Dims<4>& filter_dims, int32 filter_offset,
          const int32* bias_data, const Dims<4>& bias_dims, int stride,
          int pad_width, int pad_height, int32 output_offset,
          int32 output_multiplier, int output_shift,
          int32 output_activation_min, int32 output_activation_max,
          uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data,
          const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  Conv(input_data, input_dims, input_offset, filter_data, filter_dims,
       filter_offset, bias_data, bias_dims, stride, stride, pad_width,
       pad_height, output_offset, output_multiplier, output_shift,
       output_activation_min, output_activation_max, output_data, output_dims,
       im2col_data, im2col_dims, gemm_context);
}

template <typename T>
inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("DepthToSpace");

  const int input_depth = ArraySize(input_dims, 0);
  const int input_width = ArraySize(input_dims, 1);
  const int input_height = ArraySize(input_dims, 2);

  const int output_depth = ArraySize(output_dims, 0);
  const int batch_size = ArraySize(output_dims, 3);

  // Number of contiguous values that we can copy in one iteration.
  const int stride = block_size * output_depth;

  for (int batch = 0; batch < batch_size; ++batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch);
      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
        const T* src = input_ptr;
        for (int in_w = 0; in_w < input_width; ++in_w) {
          memcpy(output_data, src, stride * sizeof(T));
          output_data += stride;
          src += input_depth;
        }
        input_ptr += stride;
      }
    }
  }
}
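
// Illustrative note (not part of the library): a worked instance of the copy
// pattern in DepthToSpace above. With block_size = 2, input depth 8, and
// output depth 2, input element (d, w, h, b) (in the depth-first Dims order)
// lands at output element (d % 2, 2 * w + (d / 2) % 2, 2 * h + d / 4, b);
// each memcpy moves one run of block_size * output_depth = 4 contiguous
// values.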

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int kheight, int kwidth,
            uint8 byte_zero, T* output_data, const Dims<4>& output_dims) {
  Im2col(input_data, input_dims, stride, stride, pad_width, pad_height,
         kheight, kwidth, byte_zero, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void ConvAsGemm(const float* input_data, const Dims<4>& input_dims,
                const float* filter_data, const Dims<4>& filter_dims,
                const float* bias_data, const Dims<4>& bias_dims,
                float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("ConvAsGemm");

  const auto input_matrix_map =
      MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  const auto filter_matrix_map =
      MapAsMatrixWithLastDimAsCols(filter_data, filter_dims);
  auto output_matrix_map =
      MapAsMatrixWithFirstDimAsRows(output_data, output_dims);

  Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map);

  AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data,
                                       output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims,
                int32 input_offset, const uint8* filter_data,
                const Dims<4>& filter_dims, int32 filter_offset,
                const int32* bias_data, const Dims<4>& bias_dims,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims,
                gemmlowp::GemmContext* gemm_context) {
  gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit");
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  const int input_rows = input_dims.sizes[0];
  const int input_cols =
      input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3];
  const int filter_rows = filter_dims.sizes[3];
  const int filter_cols =
      filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2];
  const int output_rows = output_dims.sizes[0];
  const int output_cols =
      output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3];
  TFLITE_DCHECK_EQ(output_rows, filter_rows);
  TFLITE_DCHECK_EQ(output_cols, input_cols);
  TFLITE_DCHECK_EQ(filter_cols, input_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[0], output_rows);
  TFLITE_DCHECK_EQ(bias_dims.sizes[1], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[2], 1);
  TFLITE_DCHECK_EQ(bias_dims.sizes[3], 1);
  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix(
      filter_data, output_rows, filter_cols, filter_cols);
  gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
      input_data, filter_cols, output_cols, filter_cols);
  gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix(
      output_data, output_rows, output_cols, output_rows);
  const auto& output_pipeline = GemmlowpOutputPipeline::Make(
      bias_data, output_rows, output_offset, output_multiplier, output_shift,
      output_activation_min, output_activation_max);
  gemmlowp::GemmWithOutputPipeline<uint8, uint8,
                                   gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset,
      input_offset, output_pipeline);
}

template <typename T>
inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims,
                         int block_size, T* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("SpaceToDepth");

  const int output_depth = ArraySize(output_dims, 0);
  const int output_width = ArraySize(output_dims, 1);
  const int output_height = ArraySize(output_dims, 2);

  const int input_depth = ArraySize(input_dims, 0);
  const int batch_size = ArraySize(input_dims, 3);

  // Number of contiguous values that we can copy in one iteration.
  const int stride = block_size * input_depth;

  for (int batch = 0; batch < batch_size; ++batch) {
    for (int out_h = 0; out_h < output_height; ++out_h) {
      T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch);
      for (int offset_h = 0; offset_h < block_size; ++offset_h) {
        T* dst = output_ptr;
        for (int out_w = 0; out_w < output_width; ++out_w) {
          memcpy(dst, input_data, stride * sizeof(T));
          input_data += stride;
          dst += output_depth;
        }
        output_ptr += stride;
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void NonGlobalBatchNormalization(
    const float* input_data, const Dims<4>& input_dims, const float* mean_data,
    const Dims<4>& mean_dims, const float* multiplier_data,
    const Dims<4>& multiplier_dims, const float* offset_data,
    const Dims<4>& offset_dims, float* output_data,
    const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
                        offset_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
                        offset_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, x, y, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
              offset_data[Offset(offset_dims, c, x, y, 0)]);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void GlobalBatchNormalization(const float* input_data,
                              const Dims<4>& input_dims, const float* mean_data,
                              const Dims<4>& mean_dims,
                              const float* multiplier_data,
                              const Dims<4>& multiplier_dims,
                              const float* offset_data,
                              const Dims<4>& offset_dims, float* output_data,
                              const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                        offset_dims, 0, output_dims, 0);

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
              (input_data[Offset(input_dims, c, x, y, b)] -
               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
        }
      }
    }
  }
}
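
// Illustrative note (not part of the library): the batch-normalization
// kernels above take already-folded parameters. Under the usual folding
// convention (an assumption here, not defined by this header), they would be
// precomputed from the learned scale gamma, shift beta, and running variance
// as
//     multiplier = gamma / sqrt(variance + epsilon)
//     offset     = beta
// so that the inner expression (input - mean) * multiplier + offset equals
// the standard batch-norm output.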

inline void Relu(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Relu (not fused)");

  const auto input = MapAsVector(input_data, input_dims);
  auto output = MapAsVector(output_data, output_dims);
  output = input.cwiseMax(0.0f);
}

inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 1;
          const float lower = -1;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          const float upper = 6;
          const float lower = 0;
          float clamped = val > upper ? upper : val < lower ? lower : val;
          output_data[Offset(output_dims, c, x, y, b)] = clamped;
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("L2Normalization");
  static_assert(Ac == FusedActivationFunctionType::kNone, "");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        float squared_l2_norm = 0;
        for (int c = 0; c < depth; ++c) {
          float val = input_data[Offset(input_dims, c, x, y, b)];
          squared_l2_norm += val * val;
        }
        float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm);
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm;
        }
      }
    }
  }
}
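
// Illustrative note (not part of the library): the fixed-point routine below
// computes 1/sqrt(a) with the classic Newton-Raphson recurrence
//     x_{n+1} = x_n * (3 - a * x_n^2) / 2,
// which appears in the loop as x = 1.5 * x - (a / 2) * x^3, with 1.5 held in
// fixedpoint_half_three and a / 2 in fixedpoint_half_input. The iteration is
// seeded at x = 1 and run five times; the final multiplication by
// sqrt(2) / 2 folds the power-of-two normalization of the input back out.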

inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt,
                                          int* output_shift) {
  *output_shift = 11;
  while (input >= (1 << 29)) {
    input /= 4;
    ++*output_shift;
  }
  TFLITE_DCHECK_GT(input, 0);
  const unsigned max_left_shift_bits = __builtin_clz(input) - 1;
  const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
  const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
  *output_shift -= left_shift_bit_pairs;
  input <<= 2 * left_shift_bit_pairs;
  TFLITE_DCHECK_GE(input, (1 << 27));
  TFLITE_DCHECK_LT(input, (1 << 29));
  using gemmlowp::FixedPoint;
  using gemmlowp::Rescale;
  using gemmlowp::SaturatingRoundingMultiplyByPOT;
  // Using 3 integer bits gives us enough room for the internal arithmetic in
  // this Newton-Raphson iteration.
  using F3 = FixedPoint<int32, 3>;
  using F0 = FixedPoint<int32, 0>;
  const F3 fixedpoint_input = F3::FromRaw(input >> 1);
  const F3 fixedpoint_half_input =
      SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
  const F3 fixedpoint_half_three =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
  // Newton-Raphson iteration
  // Naive unoptimized starting guess: x = 1
  F3 x = F3::One();
  // Naive unoptimized number of iterations: 5
  for (int i = 0; i < 5; i++) {
    const F3 x3 = Rescale<3>(x * x * x);
    x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
  }
  const F0 fixedpoint_half_sqrt_2 =
      GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
  x = x * fixedpoint_half_sqrt_2;
  *output_inv_sqrt = x.raw();
  if (*output_shift < 0) {
    *output_inv_sqrt <<= -*output_shift;
    *output_shift = 0;
  }
}

inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
                            int32 input_zero_point, uint8* output_data,
                            const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
  TFLITE_DCHECK_EQ(batches, 1);
  TFLITE_DCHECK_EQ(height, 1);
  TFLITE_DCHECK_EQ(width, 1);
  int32 square_l2_norm = 0;
  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[i] - input_zero_point;
    square_l2_norm += diff * diff;
  }
  int32 inv_l2norm_multiplier;
  int inv_l2norm_shift;
  GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier,
                                &inv_l2norm_shift);

  for (int i = 0; i < depth; i++) {
    int32 diff = input_data[i] - input_zero_point;
    int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
        128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
    int32 unclamped_output_val = 128 + rescaled_diff;
    int32 output_val = std::min(255, std::max(0, unclamped_output_val));
    output_data[i] = static_cast<uint8>(output_val);
  }
}
    x1 = vminq_f32(activation_max, x1);
    x2 = vminq_f32(activation_max, x2);
    x3 = vminq_f32(activation_max, x3);
    vst1q_f32(output_data + i, x0);
    vst1q_f32(output_data + i + 4, x1);
    vst1q_f32(output_data + i + 8, x2);
    vst1q_f32(output_data + i + 12, x3);
  }
  for (; i <= size - 4; i += 4) {
    auto a1 = vld1q_f32(input1_data + i);
    auto a2 = vld1q_f32(input2_data + i);
    auto x = vaddq_f32(a1, a2);
    x = vmaxq_f32(activation_min, x);
    x = vminq_f32(activation_max, x);
    vst1q_f32(output_data + i, x);
  }
#endif  // USE_NEON

  for (; i < size; i++) {
    auto x = input1_data[i] + input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
                                                  output_activation_max);
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Add(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Add(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
inline void Add(int left_shift, const uint8* input1_data,
                const Dims<4>& input1_dims, int32 input1_offset,
                int32 input1_multiplier, int input1_shift,
                const uint8* input2_data, const Dims<4>& input2_dims,
                int32 input2_offset, int32 input2_multiplier, int input2_shift,
                int32 output_offset, int32 output_multiplier, int output_shift,
                int32 output_activation_min, int32 output_activation_max,
                uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  gemmlowp::ScopedProfilingLabel label("Add/8bit");
  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
                                              output_dims, 3);
  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
                                             output_dims, 2);
  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
                                            output_dims, 1);
  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
                                            output_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));

  int i = 0;
  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
  TFLITE_DCHECK_GT(input1_offset, -256);
  TFLITE_DCHECK_GT(input2_offset, -256);
  TFLITE_DCHECK_LT(input1_offset, 256);
  TFLITE_DCHECK_LT(input2_offset, 256);
#ifdef USE_NEON
  for (; i <= size - 8; i += 8) {
    const auto input1_val_original = vld1_u8(input1_data + i);
    const auto input2_val_original = vld1_u8(input2_data + i);
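    // The uint8 lanes are widened to int16 first (vmovl_u8) so that the
    // zero-point offsets, which may be negative, can be added without
    // wrapping: the offset range checks above guarantee each sum fits in
    // int16. The results are widened again to int32 before the shifts and
    // fixed-point multiplications.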
    const auto input1_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input1_val_original));
    const auto input2_val_s16 =
        vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
    const auto input1_val =
        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
    const auto input2_val =
        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
    const auto input1_val_high = vget_high_s16(input1_val);
    const auto input1_val_low = vget_low_s16(input1_val);
    const auto input2_val_high = vget_high_s16(input2_val);
    const auto input2_val_low = vget_low_s16(input2_val);
    auto x11 = vmovl_s16(input1_val_low);
    auto x12 = vmovl_s16(input1_val_high);
    auto x21 = vmovl_s16(input2_val_low);
    auto x22 = vmovl_s16(input2_val_high);
    const auto left_shift_dup = vdupq_n_s32(left_shift);
    x11 = vshlq_s32(x11, left_shift_dup);
    x12 = vshlq_s32(x12, left_shift_dup);
    x21 = vshlq_s32(x21, left_shift_dup);
    x22 = vshlq_s32(x22, left_shift_dup);
    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
    x11 = vshlq_s32(x11, input1_shift_dup);
    x12 = vshlq_s32(x12, input1_shift_dup);
    x21 = vshlq_s32(x21, input2_shift_dup);
    x22 = vshlq_s32(x22, input2_shift_dup);
    auto s1 = vaddq_s32(x11, x21);
    auto s2 = vaddq_s32(x12, x22);
    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
    using gemmlowp::RoundingDivideByPOT;
    s1 = RoundingDivideByPOT(s1, output_shift);
    s2 = RoundingDivideByPOT(s2, output_shift);
    const auto s1_narrowed = vmovn_s32(s1);
    const auto s2_narrowed = vmovn_s32(s2);
    const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
                             vdupq_n_s16(output_offset));
    vst1_u8(output_data + i, vqmovun_s16(s));
  }
#endif  // USE_NEON

  for (; i < size; i++) {
    const int32 input1_val = input1_offset + input1_data[i];
    const int32 input2_val = input2_offset + input2_data[i];
    const int32 shifted_input1_val = input1_val * (1 << left_shift);
    const int32 shifted_input2_val = input2_val * (1 << left_shift);
    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
        shifted_input1_val, input1_multiplier, input1_shift);
    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
        shifted_input2_val, input2_multiplier, input2_shift);
    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
    const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
                                 raw_sum, output_multiplier, output_shift) +
                             output_offset;
    const int32 clamped_output = std::min(
        output_activation_max, std::max(output_activation_min, raw_output));
    output_data[i] = static_cast<uint8>(clamped_output);
  }
}

template <FusedActivationFunctionType Ac>
void Add(const int32* input1_data, const Dims<4>& input1_dims,
         const int32* input2_data, const Dims<4>& input2_dims,
         int32* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Add/int32");
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);

  auto input1_map = MapAsVector(input1_data, input1_dims);
  auto input2_map = MapAsVector(input2_data, input2_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  if (AreSameDims(input1_dims, input2_dims)) {
    output_map.array() = input1_map.array() + input2_map.array();
  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
    auto scalar = input2_data[0];
    output_map.array() = input1_map.array() + scalar;
  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
    auto scalar = input1_data[0];
    output_map.array() = scalar + input2_map.array();
  } else {
    // Should not come here.
    TFLITE_DCHECK(false);
  }
}

// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
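  // Illustrative example (shapes chosen arbitrarily): with input1 of extents
  // {depth = 8, width = 1, height = 1, batches = 1} and input2 of extents
  // {8, 64, 64, 1}, NdArrayDescsForElementwiseBroadcast gives desc1 a stride
  // of 0 along the width and height dimensions, so
  // SubscriptToIndex(desc1, c, x, y, b) reads the same 8 values of input1
  // for every (x, y): channel-wise broadcasting of a per-channel,
  // bias-like operand.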
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T* output_data, const Dims<4>& output_dims) {
  T output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
               output_activation_min, output_activation_max, output_data,
               output_dims);
}

inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
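  // How the quantized addition below works: each 8-bit input is made signed
  // by adding its offset (the negated zero point), shifted left by
  // left_shift to gain precision headroom, and rescaled onto a common scale
  // by its fixed-point multiplier and right shift (see
  // MultiplyByQuantizedMultiplierSmallerThanOne). The int32 sum is then
  // rescaled once more onto the output scale, offset by output_offset, and
  // clamped. Rough worked example (values for illustration only): with input
  // scales 0.1 and 0.05 and output scale 0.1, the two inputs need relative
  // multipliers in the ratio 1.0 : 0.5, each encoded as a multiplier/shift
  // pair.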
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 shifted_input1_val = input1_val * (1 << left_shift);
          const int32 shifted_input2_val = input2_val * (1 << left_shift);
          const int32 scaled_input1_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input1_val, input1_multiplier, input1_shift);
          const int32 scaled_input2_val =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  shifted_input2_val, input2_multiplier, input2_shift);
          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
          const int32 raw_output =
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  raw_sum, output_multiplier, output_shift) +
              output_offset;
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, raw_output));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac>
inline void BroadcastAdd(int left_shift, const uint8* input1_data,
                         const Dims<4>& input1_dims, int32 input1_offset,
                         int32 input1_multiplier, int input1_shift,
                         const uint8* input2_data, const Dims<4>& input2_dims,
                         int32 input2_offset, int32 input2_multiplier,
                         int input2_shift, int32 output_offset,
                         int32 output_multiplier, int output_shift,
                         int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
               input1_multiplier, input1_shift, input2_data, input2_dims,
               input2_offset, input2_multiplier, input2_shift, output_offset,
               output_multiplier, output_shift, output_activation_min,
               output_activation_max, output_data, output_dims);
}

inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Mul");
  /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3,
                                              output_dims, 3);
  /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2,
                                             output_dims, 2);
  /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1,
                                            output_dims, 1);
  /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0,
                                            output_dims, 0);
  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));

  int i = 0;
  const int size = input1_dims.sizes[3] * input1_dims.strides[3];
#ifdef USE_NEON
  const auto activation_min = vdupq_n_f32(output_activation_min);
  const auto activation_max = vdupq_n_f32(output_activation_max);
  for (; i <= size - 16; i += 16) {
    auto a10 = vld1q_f32(input1_data + i);
    auto a11 = vld1q_f32(input1_data + i + 4);
    auto a12 = vld1q_f32(input1_data + i + 8);
    auto a13 = vld1q_f32(input1_data + i + 12);
    auto a20 = vld1q_f32(input2_data + i);
    auto a21 = vld1q_f32(input2_data + i + 4);
    auto a22 = vld1q_f32(input2_data + i + 8);
    auto a23 = vld1q_f32(input2_data + i + 12);
    auto x0 = vmulq_f32(a10, a20);
    auto x1 = vmulq_f32(a11, a21);
    auto x2 = vmulq_f32(a12, a22);
    auto x3 = vmulq_f32(a13, a23);

    x0 = vmaxq_f32(activation_min, x0);
    x1 = vmaxq_f32(activation_min, x1);
    x2 = vmaxq_f32(activation_min, x2);
    x3 = vmaxq_f32(activation_min, x3);
    x0 = vminq_f32(activation_max, x0);
    x1 = vminq_f32(activation_max, x1);
    x2 = vminq_f32(activation_max, x2);
    x3 = vminq_f32(activation_max, x3);

    vst1q_f32(output_data + i, x0);
    vst1q_f32(output_data + i + 4, x1);
    vst1q_f32(output_data + i + 8, x2);
    vst1q_f32(output_data + i + 12, x3);
  }
  for (; i <= size - 4; i += 4) {
    auto a1 = vld1q_f32(input1_data + i);
    auto a2 = vld1q_f32(input2_data + i);
    auto x = vmulq_f32(a1, a2);

    x = vmaxq_f32(activation_min, x);
    x = vminq_f32(activation_max, x);

    vst1q_f32(output_data + i, x);
  }
#endif  // USE_NEON

  for (; i < size; i++) {
    auto x = input1_data[i] * input2_data[i];
    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
                                                  output_activation_max);
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void Mul(const float* input1_data, const Dims<4>& input1_dims,
         const float* input2_data, const Dims<4>& input2_dims,
         float* output_data, const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  Mul(input1_data, input1_dims, input2_data, input2_dims,
      output_activation_min, output_activation_max, output_data, output_dims);
}

template <FusedActivationFunctionType Ac>
void Mul(const int32* input1_data, const Dims<4>& input1_dims,
         const int32* input2_data, const Dims<4>& input2_dims,
         int32* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Mul/int32");
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);

  auto input1_map = MapAsVector(input1_data, input1_dims);
  auto input2_map = MapAsVector(input2_data, input2_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  if (AreSameDims(input1_dims, input2_dims)) {
    output_map.array() = input1_map.array() * input2_map.array();
  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
    auto scalar = input2_data[0];
    output_map.array() = input1_map.array() * scalar;
  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
    auto scalar = input1_data[0];
    output_map.array() = scalar * input2_map.array();
  } else {
    // Should not come here.
    TFLITE_DCHECK(false);
  }
}

// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastMul is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T>
void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] *
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac, typename T>
void BroadcastMul(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T* output_data, const Dims<4>& output_dims) {
  T output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  BroadcastMul(input1_data, input1_dims, input2_data, input2_dims,
               output_activation_min, output_activation_max, output_data,
               output_dims);
}

inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          const int32 input1_val =
              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
          const int32 input2_val =
              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
          const int32 unclamped_result =
              output_offset +
              MultiplyByQuantizedMultiplierSmallerThanOne(
                  input1_val * input2_val, output_multiplier, output_shift);
          const int32 clamped_output =
              std::min(output_activation_max,
                       std::max(output_activation_min, unclamped_result));
          output_data[Offset(output_dims, c, x, y, b)] =
              static_cast<uint8>(clamped_output);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
                         int32 input1_offset, const uint8* input2_data,
                         const Dims<4>& input2_dims, int32 input2_offset,
                         int32 output_offset, int32 output_multiplier,
                         int output_shift, int32 output_activation_min,
                         int32 output_activation_max, uint8* output_data,
                         const Dims<4>& output_dims) {
  BroadcastMul(input1_data, input1_dims, input1_offset, input2_data,
               input2_dims, input2_offset, output_offset, output_multiplier,
               output_shift, output_activation_min, output_activation_max,
               output_data, output_dims);
}

// TODO(aselle): This is not actually optimized yet.
inline void Div(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] /
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
template <typename T>
void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// TODO(aselle): This is not actually optimized yet.
inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                const float* input2_data, const Dims<4>& input2_dims,
                float output_activation_min, float output_activation_max,
                float* output_data, const Dims<4>& output_dims) {
  const int batches =
      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
  const int height =
      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
  const int width =
      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
  const int depth =
      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[Offset(input1_dims, c, x, y, b)] -
                      input2_data[Offset(input2_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
// dimensionality if the runtime code does a single loop over one dimension
// that handles broadcasting as the base case. The code generator would then
// generate max(D1, D2) nested for loops.
// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
// reference_ops.h.
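// As a sketch of the single-loop scheme mentioned in the TODOs above
// (hypothetical; nothing below is implemented in this file), the base case
// would be a 1-D loop in which a stride of 0 implements broadcasting:
//
//   for (int i = 0; i < size; ++i) {
//     out[i] = in1[i * stride1] - in2[i * stride2];
//   }
//
// and the code generator would wrap this in one nested loop per remaining
// dimension, up to max(D1, D2) loops in total.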
template <typename T>
void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
                  const T* input2_data, const Dims<4>& input2_dims,
                  T output_activation_min, T output_activation_max,
                  T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BroadcastSub");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
                   const Dims<4>* const* input_dims, int inputs_count,
                   Scalar* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Concatenation");
  int concat_size = 0;
  for (int i = 0; i < inputs_count; i++) {
    for (int j = 0; j < 4; j++) {
      if (j != concat_dim) {
        MatchingArraySize(*input_dims[i], j, output_dims, j);
      }
    }
    concat_size += ArraySize(*input_dims[i], concat_dim);
  }
  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
  // For now we don't have a model with a Concatenation
  // with fused activation function.
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  int outer_size = 1;
  for (int i = concat_dim + 1; i < 4; i++) {
    outer_size *= output_dims.sizes[i];
  }
  Scalar* output_ptr = output_data;
  for (int k = 0; k < outer_size; k++) {
    for (int i = 0; i < inputs_count; ++i) {
      const int copy_size =
          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
      memcpy(output_ptr, input_data[i] + k * copy_size,
             copy_size * sizeof(Scalar));
      output_ptr += copy_size;
    }
  }
}

template <FusedActivationFunctionType Ac, typename Scalar>
void DepthConcatenation(const Scalar* const* input_data,
                        const Dims<4>* const* input_dims, int inputs_count,
                        Scalar* output_data, const Dims<4>& output_dims) {
  Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count,
                            output_data, output_dims);
}

inline void LstmCell(const float* input_data, const Dims<4>& input_dims,
                     const float* prev_activ_data,
                     const Dims<4>& prev_activ_dims, const float* weights_data,
                     const Dims<4>& weights_dims, const float* bias_data,
                     const Dims<4>& bias_dims, const float* prev_state_data,
                     const Dims<4>& prev_state_dims, float* output_state_data,
                     const Dims<4>& output_state_dims, float* output_activ_data,
                     const Dims<4>& output_activ_dims, float* concat_temp_data,
                     const Dims<4>& concat_temp_dims, float* activ_temp_data,
                     const Dims<4>& activ_temp_dims) {
  gemmlowp::ScopedProfilingLabel label("LstmCell");
  MatchingArraySize(  // batches
      input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims,
      3, output_activ_dims, 3);
  MatchingArraySize(  // height
      input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims,
      2, output_activ_dims, 2);
  MatchingArraySize(  // width
      input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims,
      1, output_activ_dims, 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
                  1);
  const int intern_activ_depth =
      MatchingArraySize(weights_dims, 1, bias_dims, 0);
  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                        output_state_dims, 0, output_activ_dims, 0);
  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);

  // Concatenate prev_activ and input data together
  std::vector<float const*> concat_input_arrays_data;
  std::vector<Dims<4> const*> concat_input_arrays_dims;
  concat_input_arrays_data.push_back(input_data);
  concat_input_arrays_data.push_back(prev_activ_data);
  concat_input_arrays_dims.push_back(&input_dims);
  concat_input_arrays_dims.push_back(&prev_activ_dims);
  Concatenation<FusedActivationFunctionType::kNone, float>(
      0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]),
      concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims);

  // Fully connected
  FullyConnected<FusedActivationFunctionType::kNone>(
      concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data,
      bias_dims, activ_temp_data, activ_temp_dims);
  // Map raw arrays to Eigen arrays so we can use Eigen's optimized array
  // operations.
  ArrayMap<float> activ_temp_map =
      MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims);
  auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth,
                                            activ_temp_map.cols());
  auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth,
                                           activ_temp_map.cols());
  auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth,
                                             activ_temp_map.cols());
  auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth,
                                             activ_temp_map.cols());
  ArrayMap<const float> prev_state_map =
      MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims);
  ArrayMap<float> output_state_map =
      MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims);
  ArrayMap<float> output_activ_map =
      MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims);

  // Combined memory state and final output calculation
  gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput");
  output_state_map =
      input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
          new_input_sm.tanh() +
      forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
          prev_state_map;
  output_activ_map =
      output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) *
      output_state_map.tanh();
}

#ifdef GEMMLOWP_NEON
// In the common case of batch size 1, a fully-connected node degenerates
// to a matrix*vector product. LSTM cells contain a fully-connected node;
// when quantized, this becomes a special type of GEMV operation where
// the output is 16bit-quantized, thus needs its own special path.
inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
                            const uint8* weights_data,
                            const Dims<4>& weights_dims,
                            uint8 weights_zero_point, const int32* bias_data,
                            const Dims<4>& bias_dims, int32 accum_multiplier,
                            int accum_shift, int16* output_data,
                            const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCell");
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
                       ArraySize(output_dims, 3),
                   1);
  const int input_size = input_dims.strides[3];
  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
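  // What the NEON loops below compute, written out for illustration
  // (pseudo-code only; s denotes the int32 accumulator):
  //
  //   for each group of 4 output rows `out`:
  //     s[out] = sum over `in` of
  //         (weights[out * input_size + in] - weights_zero_point) *
  //         (input[in] - 128)
  //     output[out] = saturating_cast_to_int16(
  //         rescale(s[out] + bias[out], accum_multiplier, accum_shift))
  //
  // The input zero point is hardwired to 128 (see input_offset_vec below);
  // `rescale` stands for applying the fixed-point multiplier together with a
  // rounding shift by accum_shift.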
  // This special fast path for quantized LSTM cells does not try to support
  // odd sizes that we haven't encountered in any LSTM cell; supporting them
  // would require special code (which would go untested until some LSTM cell
  // exercises it). We just guard our assumptions about size evenness with
  // the following assertions.
  TFLITE_DCHECK(!(output_size % 4));
  TFLITE_DCHECK(!(input_size % 8));
  const int32* bias_ptr = bias_data;
  int16* output_ptr = output_data;
  for (int out = 0; out < output_size; out += 4) {
    int32x4_t acc_0 = vdupq_n_s32(0);
    int32x4_t acc_1 = vdupq_n_s32(0);
    int32x4_t acc_2 = vdupq_n_s32(0);
    int32x4_t acc_3 = vdupq_n_s32(0);
    const int16x8_t input_offset_vec = vdupq_n_s16(-128);
    const int16x8_t weights_offset_vec = vdupq_n_s16(-weights_zero_point);
    int in = 0;
    // Handle 16 levels of depth at a time.
    for (; in <= input_size - 16; in += 16) {
      const uint8x16_t input_val_u8 = vld1q_u8(input_data + in);
      const uint8* weights_ptr = weights_data + in + out * input_size;
      uint8x16_t weights_val_u8_0 = vld1q_u8(weights_ptr + 0 * input_size);
      uint8x16_t weights_val_u8_1 = vld1q_u8(weights_ptr + 1 * input_size);
      uint8x16_t weights_val_u8_2 = vld1q_u8(weights_ptr + 2 * input_size);
      uint8x16_t weights_val_u8_3 = vld1q_u8(weights_ptr + 3 * input_size);
      int16x8_t input_val_0, input_val_1;
      const uint8x8_t low = vget_low_u8(input_val_u8);
      const uint8x8_t high = vget_high_u8(input_val_u8);
      input_val_0 = vreinterpretq_s16_u16(vmovl_u8(low));
      input_val_1 = vreinterpretq_s16_u16(vmovl_u8(high));
      input_val_0 = vaddq_s16(input_val_0, input_offset_vec);
      input_val_1 = vaddq_s16(input_val_1, input_offset_vec);
      int16x8_t weights_val_0_0, weights_val_1_0, weights_val_2_0,
          weights_val_3_0;
      int16x8_t weights_val_0_1, weights_val_1_1, weights_val_2_1,
          weights_val_3_1;
      weights_val_0_0 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_0))),
          weights_offset_vec);
      weights_val_0_1 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_0))),
          weights_offset_vec);
      weights_val_1_0 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_1))),
          weights_offset_vec);
      weights_val_1_1 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_1))),
          weights_offset_vec);
      weights_val_2_0 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_2))),
          weights_offset_vec);
      weights_val_2_1 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_2))),
          weights_offset_vec);
      weights_val_3_0 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(weights_val_u8_3))),
          weights_offset_vec);
      weights_val_3_1 = vaddq_s16(
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(weights_val_u8_3))),
          weights_offset_vec);
      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_0),
                        vget_low_s16(input_val_0));
      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_0),
                        vget_low_s16(input_val_0));
      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_0),
                        vget_low_s16(input_val_0));
      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_0),
                        vget_low_s16(input_val_0));
      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_0),
                        vget_high_s16(input_val_0));
      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_0),
                        vget_high_s16(input_val_0));
      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_0),
                        vget_high_s16(input_val_0));
      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_0),
                        vget_high_s16(input_val_0));
      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0_1),
                        vget_low_s16(input_val_1));
      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1_1),
                        vget_low_s16(input_val_1));
      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2_1),
                        vget_low_s16(input_val_1));
      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3_1),
                        vget_low_s16(input_val_1));
      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0_1),
                        vget_high_s16(input_val_1));
      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1_1),
                        vget_high_s16(input_val_1));
      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2_1),
                        vget_high_s16(input_val_1));
      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3_1),
                        vget_high_s16(input_val_1));
    }
    // Handle 8 levels of depth at a time.
    for (; in < input_size; in += 8) {
      const uint8x8_t input_val_u8 = vld1_u8(input_data + in);
      const uint8* weights_ptr = weights_data + in + out * input_size;
      uint8x8_t weights_val_u8_0 = vld1_u8(weights_ptr + 0 * input_size);
      uint8x8_t weights_val_u8_1 = vld1_u8(weights_ptr + 1 * input_size);
      uint8x8_t weights_val_u8_2 = vld1_u8(weights_ptr + 2 * input_size);
      uint8x8_t weights_val_u8_3 = vld1_u8(weights_ptr + 3 * input_size);
      int16x8_t input_val;
      input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8));
      input_val = vaddq_s16(input_val, input_offset_vec);
      int16x8_t weights_val_0, weights_val_1, weights_val_2, weights_val_3;
      weights_val_0 =
          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_0)),
                    weights_offset_vec);
      weights_val_1 =
          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_1)),
                    weights_offset_vec);
      weights_val_2 =
          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_2)),
                    weights_offset_vec);
      weights_val_3 =
          vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(weights_val_u8_3)),
                    weights_offset_vec);
      acc_0 = vmlal_s16(acc_0, vget_low_s16(weights_val_0),
                        vget_low_s16(input_val));
      acc_1 = vmlal_s16(acc_1, vget_low_s16(weights_val_1),
                        vget_low_s16(input_val));
      acc_2 = vmlal_s16(acc_2, vget_low_s16(weights_val_2),
                        vget_low_s16(input_val));
      acc_3 = vmlal_s16(acc_3, vget_low_s16(weights_val_3),
                        vget_low_s16(input_val));
      acc_0 = vmlal_s16(acc_0, vget_high_s16(weights_val_0),
                        vget_high_s16(input_val));
      acc_1 = vmlal_s16(acc_1, vget_high_s16(weights_val_1),
                        vget_high_s16(input_val));
      acc_2 = vmlal_s16(acc_2, vget_high_s16(weights_val_2),
                        vget_high_s16(input_val));
      acc_3 = vmlal_s16(acc_3, vget_high_s16(weights_val_3),
                        vget_high_s16(input_val));
    }
    // Horizontally reduce accumulators
    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
    pairwise_reduced_acc_0 =
        vpadd_s32(vget_low_s32(acc_0), vget_high_s32(acc_0));
    pairwise_reduced_acc_1 =
        vpadd_s32(vget_low_s32(acc_1), vget_high_s32(acc_1));
    pairwise_reduced_acc_2 =
        vpadd_s32(vget_low_s32(acc_2), vget_high_s32(acc_2));
    pairwise_reduced_acc_3 =
        vpadd_s32(vget_low_s32(acc_3), vget_high_s32(acc_3));
    const int32x2_t reduced_lo =
        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
    const int32x2_t reduced_hi =
        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
    // Add bias values.
    int32x4_t bias_vec = vld1q_s32(bias_ptr);
    bias_ptr += 4;
    reduced = vaddq_s32(reduced, bias_vec);
    int left_shift = accum_shift > 0 ? accum_shift : 0;
    int right_shift = accum_shift > 0 ? 0 : -accum_shift;
    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
    // Multiply by the fixed-point multiplier.
    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
    // Rounding-shift-right.
    using gemmlowp::RoundingDivideByPOT;
    reduced = RoundingDivideByPOT(reduced, right_shift);
    // Narrow values down to 16 bit signed.
    const int16x4_t res16 = vqmovn_s32(reduced);
    vst1_s16(output_ptr, res16);
    output_ptr += 4;
  }
}
#endif

// Quantized LSTM cell. Currently just a copy of the reference impl in
// reference_ops.h. See the big function comment there, not replicating it
// here.
template <int StateIntegerBits>
void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
              const uint8* prev_activ_data_uint8,
              const Dims<4>& prev_activ_dims, const uint8* weights_data_uint8,
              const Dims<4>& weights_dims, const int32* bias_data_int32,
              const Dims<4>& bias_dims, const int16* prev_state_data_int16,
              const Dims<4>& prev_state_dims, int16* output_state_data_int16,
              const Dims<4>& output_state_dims, uint8* output_activ_data_uint8,
              const Dims<4>& output_activ_dims, uint8* concat_temp_data_uint8,
              const Dims<4>& concat_temp_dims, int16* activ_temp_data_int16,
              const Dims<4>& activ_temp_dims, int32 weights_zero_point,
              int32 accum_multiplier, int accum_shift,
              gemmlowp::GemmContext* gemm_context) {
  gemmlowp::ScopedProfilingLabel label(
      "LstmCell/quantized (8bit external, 16bit internal)");
  // Gather dimensions information, and perform consistency checks.
  const int batches =
      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
                        output_state_dims, 3, output_activ_dims, 3);
  const int height =
      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
                        output_state_dims, 2, output_activ_dims, 2);
  const int width =
      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
                        output_state_dims, 1, output_activ_dims, 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
  const int input_depth = ArraySize(input_dims, 0);
  const int prev_activ_depth = ArraySize(prev_activ_dims, 0);
  const int total_input_depth = prev_activ_depth + input_depth;
  TFLITE_CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth);
  TFLITE_CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3),
                  1);
  const int intern_activ_depth =
      MatchingArraySize(weights_dims, 1, bias_dims, 0);
  TFLITE_CHECK_EQ(intern_activ_depth % 4, 0);
  const int output_depth =
      MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                        output_state_dims, 0, output_activ_dims, 0);
  TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
  const int fc_batches = ArraySize(activ_temp_dims, 1) *
                         ArraySize(activ_temp_dims, 2) *
                         ArraySize(activ_temp_dims, 3);
  const int fc_output_depth =
      MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
  const int fc_accum_depth = ArraySize(weights_dims, 0);
  TFLITE_CHECK_EQ(fc_output_depth, 4 * output_depth);
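  // Note on the factor of 4 checked just above: the fully-connected node
  // produces the pre-activations of the four LSTM gates (input, input
  // modulation, forget, output) stacked along the depth dimension, so
  // intern_activ_depth == 4 * output_depth, and the per-gate pointers set up
  // further below walk activ_temp in output_depth-sized slices.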

  // Depth-concatenate prev_activ and input data together.
  uint8 const* concat_input_arrays_data[2] = {input_data_uint8,
                                              prev_activ_data_uint8};
  Dims<4> const* concat_input_arrays_dims[2] = {&input_dims, &prev_activ_dims};
  Concatenation<FusedActivationFunctionType::kNone, uint8>(
      0, concat_input_arrays_data, concat_input_arrays_dims, 2,
      concat_temp_data_uint8, concat_temp_dims);

  // Implementation of the fully connected node inside the LSTM cell.
  // The operands are 8-bit integers, the accumulators are internally 32bit
  // integers, and the output is 16-bit fixed-point with 3 integer bits so
  // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
  // is explained in the function comment above.
  bool gemm_already_performed = false;
#ifdef GEMMLOWP_NEON
  if (fc_batches == 1 && !(fc_output_depth % 4) && !(fc_accum_depth % 8)) {
    GEMVForLstmCell(concat_temp_data_uint8, concat_temp_dims,
                    weights_data_uint8, weights_dims, weights_zero_point,
                    bias_data_int32, bias_dims, accum_multiplier, accum_shift,
                    activ_temp_data_int16, activ_temp_dims);
    gemm_already_performed = true;
  }
#endif
  if (!gemm_already_performed) {
    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor>
        weights_matrix(weights_data_uint8, fc_output_depth, fc_accum_depth);
    gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix(
        concat_temp_data_uint8, fc_accum_depth, fc_batches);
    gemmlowp::MatrixMap<int16, gemmlowp::MapOrder::ColMajor> output_matrix(
        activ_temp_data_int16, fc_output_depth, fc_batches);
    typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col>
        ColVectorMap;
    ColVectorMap bias_vector(bias_data_int32, fc_output_depth);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
    gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent scale_stage;
    scale_stage.result_offset_after_shift = 0;
    scale_stage.result_fixedpoint_multiplier = accum_multiplier;
    scale_stage.result_exponent = accum_shift;
    gemmlowp::OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
    auto output_pipeline = std::make_tuple(bias_addition_stage, scale_stage,
                                           saturating_cast_int16_stage);
    gemmlowp::GemmWithOutputPipeline<
        uint8, int16, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
        gemm_context, weights_matrix, input_matrix, &output_matrix,
        -weights_zero_point, -128, output_pipeline);
  }

  // Rest of the LSTM cell: tanh and logistic math functions, and some adds
  // and muls, all done in 16-bit fixed-point.
  const int outer_size = batches * width * height;
  const int16* input_gate_input_ptr = activ_temp_data_int16;
  const int16* input_modulation_gate_input_ptr =
      activ_temp_data_int16 + output_depth;
  const int16* forget_gate_input_ptr = activ_temp_data_int16 + 2 * output_depth;
  const int16* output_gate_input_ptr = activ_temp_data_int16 + 3 * output_depth;
  const int16* prev_state_ptr = prev_state_data_int16;
  int16* output_state_data_ptr = output_state_data_int16;
  uint8* output_activ_data_ptr = output_activ_data_uint8;

  for (int b = 0; b < outer_size; ++b) {
    int c = 0;
#ifdef GEMMLOWP_NEON
    for (; c <= output_depth - 8; c += 8) {
      // Define the fixed-point data types that we will use here. All use
      // int16 as the underlying integer type, i.e. all are 16-bit fixed-point.
      // They only differ by the number of integral vs. fractional bits,
      // determining the range of values that they can represent.
      //
      // F0 uses 0 integer bits, range [-1, 1].
      // This is the return type of math functions such as tanh, logistic,
      // whose range is in [-1, 1].
      using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
      // F3 uses 3 integer bits, range [-8, 8].
      // This is the range of the previous fully-connected node's output,
      // which is our input here.
      using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
      // 2^StateIntegerBits]. It's used to represent the internal state, whose
      // number of integer bits is currently dictated by the model. See comment
      // on the StateIntegerBits template parameter above.
      using FS = gemmlowp::FixedPoint<int16x8_t, StateIntegerBits>;
      // Implementation of input gate, using fixed-point logistic function.
      F3 input_gate_input = F3::FromRaw(vld1q_s16(input_gate_input_ptr));
      input_gate_input_ptr += 8;
      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
      // Implementation of input modulation gate, using fixed-point tanh
      // function.
      F3 input_modulation_gate_input =
          F3::FromRaw(vld1q_s16(input_modulation_gate_input_ptr));
      input_modulation_gate_input_ptr += 8;
      F0 input_modulation_gate_output =
          gemmlowp::tanh(input_modulation_gate_input);
      // Implementation of forget gate, using fixed-point logistic function.
      F3 forget_gate_input = F3::FromRaw(vld1q_s16(forget_gate_input_ptr));
      forget_gate_input_ptr += 8;
      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
      // Implementation of output gate, using fixed-point logistic function.
      F3 output_gate_input = F3::FromRaw(vld1q_s16(output_gate_input_ptr));
      output_gate_input_ptr += 8;
      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
      // Implementation of internal multiplication nodes, still in fixed-point.
      F0 input_times_input_modulation =
          input_gate_output * input_modulation_gate_output;
      FS prev_state = FS::FromRaw(vld1q_s16(prev_state_ptr));
      prev_state_ptr += 8;
      FS prev_state_times_forget_state = forget_gate_output * prev_state;
      // Implementation of internal addition node, saturating.
      FS new_state = gemmlowp::SaturatingAdd(
          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
          prev_state_times_forget_state);
      // Implementation of last internal tanh node, still in fixed-point.
      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
      // Store the new internal state back to memory, as 16-bit integers.
      vst1q_s16(output_state_data_ptr, new_state.raw());
      output_state_data_ptr += 8;
      // Down-scale the output activations to 8-bit integers, saturating,
      // and store back to memory.
      int16x8_t rescaled_output_activ =
          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
      int8x8_t int8_output_activ = vqmovn_s16(rescaled_output_activ);
      uint8x8_t uint8_output_activ =
          vadd_u8(vdup_n_u8(128), vreinterpret_u8_s8(int8_output_activ));
      vst1_u8(output_activ_data_ptr, uint8_output_activ);
      output_activ_data_ptr += 8;
    }
#endif
    for (; c < output_depth; ++c) {
      // Define the fixed-point data types that we will use here. All use
      // int16 as the underlying integer type, i.e. all are 16-bit fixed-point.
      // They only differ by the number of integral vs. fractional bits,
      // determining the range of values that they can represent.
      //
      // F0 uses 0 integer bits, range [-1, 1].
      // This is the return type of math functions such as tanh, logistic,
      // whose range is in [-1, 1].
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      // F3 uses 3 integer bits, range [-8, 8].
      // This is the range of the previous fully-connected node's output,
      // which is our input here.
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
      // 2^StateIntegerBits]. It's used to represent the internal state, whose
      // number of integer bits is currently dictated by the model. See comment
      // on the StateIntegerBits template parameter above.
      using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
      // Implementation of input gate, using fixed-point logistic function.
      F3 input_gate_input = F3::FromRaw(*input_gate_input_ptr++);
      F0 input_gate_output = gemmlowp::logistic(input_gate_input);
      // Implementation of input modulation gate, using fixed-point tanh
      // function.
      F3 input_modulation_gate_input =
          F3::FromRaw(*input_modulation_gate_input_ptr++);
      F0 input_modulation_gate_output =
          gemmlowp::tanh(input_modulation_gate_input);
      // Implementation of forget gate, using fixed-point logistic function.
      F3 forget_gate_input = F3::FromRaw(*forget_gate_input_ptr++);
      F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
      // Implementation of output gate, using fixed-point logistic function.
      F3 output_gate_input = F3::FromRaw(*output_gate_input_ptr++);
      F0 output_gate_output = gemmlowp::logistic(output_gate_input);
      // Implementation of internal multiplication nodes, still in fixed-point.
      F0 input_times_input_modulation =
          input_gate_output * input_modulation_gate_output;
      FS prev_state = FS::FromRaw(*prev_state_ptr++);
      FS prev_state_times_forget_state = forget_gate_output * prev_state;
      // Implementation of internal addition node, saturating.
      FS new_state = gemmlowp::SaturatingAdd(
          gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
          prev_state_times_forget_state);
      // Implementation of last internal tanh node, still in fixed-point.
      F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state);
      // Store the new internal state back to memory, as 16-bit integers.
      *output_state_data_ptr++ = new_state.raw();
      // Down-scale the output activations to 8-bit integers, saturating,
      // and store back to memory.
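      // Worked arithmetic for the downscale below (following from the F0
      // format's 15 fractional bits): output_activ_int16.raw() lies in
      // [-32768, 32767] and represents a real value in [-1, 1).
      // RoundingDivideByPOT(raw, 8) divides by 2^8 with rounding, giving a
      // value in [-128, 128]; clamping to [-128, 127] and adding 128 yields
      // a uint8 in [0, 255], i.e. roughly 128 * (1 + activation).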
      int16 rescaled_output_activ =
          gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
      int16 clamped_output_activ =
          std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
      *output_activ_data_ptr++ = 128 + clamped_output_activ;
    }
    input_gate_input_ptr += 3 * output_depth;
    input_modulation_gate_input_ptr += 3 * output_depth;
    forget_gate_input_ptr += 3 * output_depth;
    output_gate_input_ptr += 3 * output_depth;
  }
}
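
// To make the fixed-point formats used above concrete: a
// gemmlowp::FixedPoint<int16, kIntegerBits> stores a raw int16 value r that
// represents the real number r * 2^(kIntegerBits - 15). For instance, in F0
// (0 integer bits) the raw value 16384 represents 16384 / 2^15 = 0.5, and in
// F3 (3 integer bits) the raw value 4096 represents 4096 * 2^3 / 2^15 = 1.0.
// The two helpers below are an illustrative sketch of that mapping only; the
// names are hypothetical and they are not used by any kernel in this file.
inline float IllustrativeFixedPointToReal(int16 raw, int integer_bits) {
  // Scale by 2^(integer_bits - 15): the sign bit leaves 15 data bits, of
  // which integer_bits sit to the left of the binary point.
  return static_cast<float>(raw) * std::ldexp(1.0f, integer_bits - 15);
}
inline int16 IllustrativeRealToFixedPoint(float real, int integer_bits) {
  // Inverse mapping, rounding to nearest. Real values outside the format's
  // representable range would need saturation, which is omitted here.
  return static_cast<int16>(
      TfLiteRound(real * std::ldexp(1.0f, 15 - integer_bits)));
}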

template <FusedActivationFunctionType Ac, typename Scalar>
void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims,
                     int outputs_count, Scalar* const* output_data,
                     const Dims<4>* const* output_dims) {
  gemmlowp::ScopedProfilingLabel label("TensorFlowSplit");
  TFLITE_DCHECK_GE(outputs_count, 1);
  for (int i = 0; i < outputs_count; i++) {
    /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3);
    /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2);
    /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1);
  }
  const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3);
  const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2);
  const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1);
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  // For now we don't have a model with a TensorFlowSplit with a fused
  // activation function.
  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
  const int whb = width * height * batches;
  const Scalar* input_ptr = input_data;
  for (int k = 0; k < whb; k++) {
    for (int i = 0; i < outputs_count; ++i) {
      memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr,
             output_dims[i]->sizes[0] * sizeof(Scalar));
      input_ptr += output_dims[i]->sizes[0];
    }
  }
}

inline int NodeOffset(int b, int h, int w, int height, int width) {
  return (b * height + h) * width + w;
}

inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int kwidth, int kheight,
                        float output_activation_min,
                        float output_activation_max, float* output_data,
                        const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("AveragePool");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  // TODO(benoitjacob) make this a proper reference impl without Eigen!
  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
  // TODO(benoitjacob) get rid of the dynamic memory allocation here!
  Eigen::VectorXf out_count(out_mat.cols());
  out_count.setZero();
  // Prefill the output to 0.
  out_mat.setZero();
  for (int b = 0; b < batches; ++b) {
    for (int h = 0; h < input_height; ++h) {
      for (int w = 0; w < input_width; ++w) {
        // (h_start, h_end) * (w_start, w_end) is the range of output
        // positions that this input position projects to.
        int hpad = h + pad_height;
        int wpad = w + pad_width;
        int h_start =
            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
        int h_end = std::min(hpad / stride_height + 1, output_height);
        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
        int w_end = std::min(wpad / stride_width + 1, output_width);
        // Compute the elementwise sum.
        for (int ph = h_start; ph < h_end; ++ph) {
          for (int pw = w_start; pw < w_end; ++pw) {
            int out_offset =
                NodeOffset(b, ph, pw, output_height, output_width);
            out_mat.col(out_offset) +=
                in_mat.col(NodeOffset(b, h, w, input_height, input_width));
            out_count(out_offset)++;
          }
        }
      }
    }
  }
  // Divide the output by the actual number of elements being averaged over.
  TFLITE_DCHECK_GT(out_count.minCoeff(), 0);
  out_mat.array().rowwise() /= out_count.transpose().array();

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      for (int x = 0; x < output_width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  output_data[Offset(output_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}
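
// Worked example of the back-projection above (illustrative numbers only):
// with kheight = 3, stride_height = 2, pad_height = 0 and output_height = 4,
// the input row h = 5 gives hpad = 5, so
//   h_start = (5 - 3) / 2 + 1 = 2,   h_end = min(5 / 2 + 1, 4) = 3,
// i.e. input row 5 contributes only to output row 2: the pooling window at
// output row 2 covers input rows [4, 6], while the window at output row 3
// starts at input row 6. This is the inverse of the usual forward mapping
// in_y = out_y * stride - pad.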

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int kwidth, int kheight, float* output_data,
                 const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);

  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, kwidth, kheight, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const float* input_data, const Dims<4>& input_dims,
                 int stride, int pad_width, int pad_height, int filter_width,
                 int filter_height, float* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width,
                  pad_height, filter_width, filter_height, output_data,
                  output_dims);
}

inline void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                        int stride_width, int stride_height, int pad_width,
                        int pad_height, int filter_width, int filter_height,
                        int32 output_activation_min,
                        int32 output_activation_max, uint8* output_data,
                        const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("AveragePool/8bit");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        const int in_x_origin = (out_x * stride_width) - pad_width;
        const int in_y_origin = (out_y * stride_height) - pad_height;
        const int filter_x_start = std::max(0, -in_x_origin);
        const int filter_x_end =
            std::min(filter_width, input_width - in_x_origin);
        const int filter_y_start = std::max(0, -in_y_origin);
        const int filter_y_end =
            std::min(filter_height, input_height - in_y_origin);
        const int filter_count =
            (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
        // 1280 required by Inception v3
        static constexpr int kAccBufferMaxSize = 2048;
        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
        uint16 acc[kAccBufferMaxSize];
        memset(acc, 0, depth * sizeof(acc[0]));
        const uint8* input_ptr =
            input_data + input_dims.strides[1] * in_x_origin +
            input_dims.strides[2] * in_y_origin +
            input_dims.strides[3] * batch;
        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
          const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
                                       filter_x_start * input_dims.strides[1];
          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
            int channel = 0;
#ifdef USE_NEON
            for (; channel <= depth - 16; channel += 16) {
              uint16x8_t acc_reg[2];
              for (int i = 0; i < 2; i++) {
                acc_reg[i] = vld1q_u16(acc + channel + 8 * i);
              }
              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
              input_row_ptr += 16;
              acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg));
              acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg));
              for (int i = 0; i < 2; i++) {
                vst1q_u16(acc + channel + 8 * i, acc_reg[i]);
              }
            }
            for (; channel <= depth - 8; channel += 8) {
              uint16x8_t acc_reg = vld1q_u16(acc + channel);
              uint8x8_t input_reg = vld1_u8(input_row_ptr);
              input_row_ptr += 8;
              acc_reg = vaddw_u8(acc_reg, input_reg);
              vst1q_u16(acc + channel, acc_reg);
            }
#endif
            for (; channel < depth; ++channel) {
              acc[channel] += *input_row_ptr++;
            }
          }
        }
        uint8* output_ptr =
            output_data + Offset(output_dims, 0, out_x, out_y, batch);
        int channel = 0;
#ifdef USE_NEON
#define AVGPOOL_DIVIDING_BY(FILTER_COUNT)                              \
  if (filter_count == FILTER_COUNT) {                                  \
    for (; channel <= depth - 8; channel += 8) {                       \
      uint16 buf[8];                                                   \
      for (int i = 0; i < 8; i++) {                                    \
        buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \
      }                                                                \
      uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));                     \
      buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));          \
      buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));          \
      vst1_u8(output_ptr + channel, buf8);                             \
    }                                                                  \
  }
        AVGPOOL_DIVIDING_BY(9)
        AVGPOOL_DIVIDING_BY(15)
#undef AVGPOOL_DIVIDING_BY
        for (; channel <= depth - 8; channel += 8) {
          uint16 buf[8];
          for (int i = 0; i < 8; i++) {
            buf[i] = (acc[channel + i] + filter_count / 2) / filter_count;
          }
          uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf));
          buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max));
          buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min));
          vst1_u8(output_ptr + channel, buf8);
        }
#endif
        for (; channel < depth; ++channel) {
          uint16 a = (acc[channel] + filter_count / 2) / filter_count;
          a = std::max<uint16>(a, output_activation_min);
          a = std::min<uint16>(a, output_activation_max);
          output_ptr[channel] = static_cast<uint8>(a);
        }
      }
    }
  }
}
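
// The per-channel average above is computed as
//   (acc[channel] + filter_count / 2) / filter_count,
// which is integer division rounded to nearest. A quick worked check with
// filter_count = 9: acc = 40 gives (40 + 4) / 9 = 4 (40 / 9 = 4.44 rounds
// down), while acc = 41 gives (41 + 4) / 9 = 5 (41 / 9 = 4.56 rounds up).
// The uint16 accumulator holds values up to 65535 = 257 * 255, so with
// uint8 inputs it cannot overflow as long as filter_count <= 257.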

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                 int stride_width, int stride_height, int pad_width,
                 int pad_height, int filter_width, int filter_height,
                 int32 output_activation_min, int32 output_activation_max,
                 uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  AveragePool(input_data, input_dims, stride_width, stride_height, pad_width,
              pad_height, filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void AveragePool(const uint8* input_data, const Dims<4>& input_dims,
                 int stride, int pad_width, int pad_height, int filter_width,
                 int filter_height, int32 output_activation_min,
                 int32 output_activation_max, uint8* output_data,
                 const Dims<4>& output_dims) {
  AveragePool<Ac>(input_data, input_dims, stride, stride, pad_width,
                  pad_height, filter_width, filter_height,
                  output_activation_min, output_activation_max, output_data,
                  output_dims);
}
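
// Note on the legacy template wrappers (illustrative, hypothetical call): for
// quantized uint8 data the fused activation is expressed directly as a
// [min, max] clamp on the quantized values, so
//   AveragePool<FusedActivationFunctionType::kNone>(
//       input, input_dims, /*stride=*/2, /*pad_width=*/0, /*pad_height=*/0,
//       /*filter_width=*/3, /*filter_height=*/3,
//       /*output_activation_min=*/0, /*output_activation_max=*/255,
//       output, output_dims);
// forwards to the overload above with the full uint8 range [0, 255], which
// is exactly what the kNone DCHECKs assert.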

inline void MaxPool(const float* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int kwidth, int kheight,
                    float output_activation_min, float output_activation_max,
                    float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("MaxPool");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
  // Prefill the output to the minimum representable float value.
  out_mat.setConstant(std::numeric_limits<float>::lowest());
  for (int b = 0; b < batches; ++b) {
    for (int h = 0; h < input_height; ++h) {
      for (int w = 0; w < input_width; ++w) {
        // (h_start, h_end) * (w_start, w_end) is the range of output
        // positions that this input position projects to.
        int hpad = h + pad_height;
        int wpad = w + pad_width;
        int h_start =
            (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
        int h_end = std::min(hpad / stride_height + 1, output_height);
        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
        int w_end = std::min(wpad / stride_width + 1, output_width);
        // Compute the elementwise maximum.
        for (int ph = h_start; ph < h_end; ++ph) {
          for (int pw = w_start; pw < w_end; ++pw) {
            int out_offset =
                NodeOffset(b, ph, pw, output_height, output_width);
            out_mat.col(out_offset) =
                out_mat.col(out_offset)
                    .cwiseMax(in_mat.col(
                        NodeOffset(b, h, w, input_height, input_width)));
          }
        }
      }
    }
  }

  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      for (int x = 0; x < output_width; ++x) {
        for (int c = 0; c < depth; ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              ActivationFunctionWithMinMax(
                  output_data[Offset(output_dims, c, x, y, b)],
                  output_activation_min, output_activation_max);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width,
             int pad_height, int kwidth, int kheight, float* output_data,
             const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, kwidth, kheight, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const float* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width,
             int filter_height, float* output_data,
             const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_data, output_dims);
}

inline void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
                    int stride_width, int stride_height, int pad_width,
                    int pad_height, int filter_width, int filter_height,
                    int32 output_activation_min, int32 output_activation_max,
                    uint8* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  for (int batch = 0; batch < batches; ++batch) {
    for (int out_y = 0; out_y < output_height; ++out_y) {
      for (int out_x = 0; out_x < output_width; ++out_x) {
        const int in_x_origin = (out_x * stride_width) - pad_width;
        const int in_y_origin = (out_y * stride_height) - pad_height;
        const int filter_x_start = std::max(0, -in_x_origin);
        const int filter_x_end =
            std::min(filter_width, input_width - in_x_origin);
        const int filter_y_start = std::max(0, -in_y_origin);
        const int filter_y_end =
            std::min(filter_height, input_height - in_y_origin);
        // 2048 required by Inception v3
        static constexpr int kAccBufferMaxSize = 2048;
        TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
        uint8 acc[kAccBufferMaxSize];
        memset(acc, 0, depth * sizeof(acc[0]));
        const uint8* input_ptr =
            input_data + input_dims.strides[1] * in_x_origin +
            input_dims.strides[2] * in_y_origin +
            input_dims.strides[3] * batch;
        for (int fy = filter_y_start; fy < filter_y_end; fy++) {
          const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] +
                                       filter_x_start * input_dims.strides[1];
          for (int fx = filter_x_start; fx < filter_x_end; fx++) {
            int channel = 0;
#ifdef USE_NEON
            for (; channel <= depth - 16; channel += 16) {
              uint8x16_t acc_reg = vld1q_u8(acc + channel);
              uint8x16_t input_reg = vld1q_u8(input_row_ptr);
              input_row_ptr += 16;
              acc_reg = vmaxq_u8(acc_reg, input_reg);
              vst1q_u8(acc + channel, acc_reg);
            }

            for (; channel <= depth - 8; channel += 8) {
              uint8x8_t acc_reg = vld1_u8(acc + channel);
              uint8x8_t input_reg = vld1_u8(input_row_ptr);
              input_row_ptr += 8;
              acc_reg = vmax_u8(acc_reg, input_reg);
              vst1_u8(acc + channel, acc_reg);
            }
#endif
            for (; channel < depth; ++channel) {
              acc[channel] = std::max(acc[channel], *input_row_ptr++);
            }
          }
        }
        uint8* output_ptr =
            output_data + Offset(output_dims, 0, out_x, out_y, batch);
        int channel = 0;
#ifdef USE_NEON
        for (; channel <= depth - 16; channel += 16) {
          uint8x16_t a = vld1q_u8(acc + channel);
          a = vminq_u8(a, vdupq_n_u8(output_activation_max));
          a = vmaxq_u8(a, vdupq_n_u8(output_activation_min));
          vst1q_u8(output_ptr + channel, a);
        }
        for (; channel <= depth - 8; channel += 8) {
          uint8x8_t a = vld1_u8(acc + channel);
          a = vmin_u8(a, vdup_n_u8(output_activation_max));
          a = vmax_u8(a, vdup_n_u8(output_activation_min));
          vst1_u8(output_ptr + channel, a);
        }
#endif
        for (; channel < depth; ++channel) {
          uint8 a = acc[channel];
          a = std::max<uint8>(a, output_activation_min);
          a = std::min<uint8>(a, output_activation_max);
          output_ptr[channel] = static_cast<uint8>(a);
        }
      }
    }
  }
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
             int stride_width, int stride_height, int pad_width,
             int pad_height, int filter_width, int filter_height,
             int32 output_activation_min, int32 output_activation_max,
             uint8* output_data, const Dims<4>& output_dims) {
  static_assert(Ac == FusedActivationFunctionType::kNone ||
                    Ac == FusedActivationFunctionType::kRelu ||
                    Ac == FusedActivationFunctionType::kRelu6 ||
                    Ac == FusedActivationFunctionType::kRelu1,
                "");
  if (Ac == FusedActivationFunctionType::kNone) {
    TFLITE_DCHECK_EQ(output_activation_min, 0);
    TFLITE_DCHECK_EQ(output_activation_max, 255);
  }
  MaxPool(input_data, input_dims, stride_width, stride_height, pad_width,
          pad_height, filter_width, filter_height, output_activation_min,
          output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void MaxPool(const uint8* input_data, const Dims<4>& input_dims, int stride,
             int pad_width, int pad_height, int filter_width,
             int filter_height, int32 output_activation_min,
             int32 output_activation_max, uint8* output_data,
             const Dims<4>& output_dims) {
  MaxPool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
              filter_width, filter_height, output_activation_min,
              output_activation_max, output_data, output_dims);
}

inline void L2Pool(const float* input_data, const Dims<4>& input_dims,
                   int stride_width, int stride_height, int pad_width,
                   int pad_height, int filter_width, int filter_height,
                   float output_activation_min, float output_activation_max,
                   float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("L2Pool");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  // Actually carry out L2 Pool. The code is written in forward mode: we go
  // through the input values once, and write to all the pooled regions that
  // each input value maps to.
  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
  Eigen::VectorXf in_square(in_mat.rows());
  Eigen::VectorXf out_count(out_mat.cols());
  out_count.setZero();
  // Prefill the output to 0.
  out_mat.setZero();
  for (int b = 0; b < batches; ++b) {
    for (int h = 0; h < input_height; ++h) {
      for (int w = 0; w < input_width; ++w) {
        // (h_start, h_end) * (w_start, w_end) is the range of output
        // positions that this input position projects to.
        const int hpad = h + pad_height;
        const int wpad = w + pad_width;
        const int h_start = (hpad < filter_height)
                                ? 0
                                : (hpad - filter_height) / stride_height + 1;
        const int h_end = std::min(hpad / stride_height + 1, output_height);
        const int w_start = (wpad < filter_width)
                                ? 0
                                : (wpad - filter_width) / stride_width + 1;
        const int w_end = std::min(wpad / stride_width + 1, output_width);
        // Pre-compute the squares.
        const int in_offset = w + input_width * (h + input_height * b);
        in_square =
            in_mat.col(in_offset).array() * in_mat.col(in_offset).array();
        // Compute the elementwise sum of squares.
        for (int ph = h_start; ph < h_end; ++ph) {
          for (int pw = w_start; pw < w_end; ++pw) {
            const int out_offset =
                pw + output_width * (ph + output_height * b);
            out_mat.col(out_offset) += in_square;
            out_count(out_offset)++;
          }
        }
      }
    }
  }

  out_count = out_count.array().inverse();
  out_mat =
      (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt();
}
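
// L2 pooling thus computes, per channel and per output position,
//   out = sqrt((x_1^2 + x_2^2 + ... + x_n^2) / n)
// over the n input values in the pooling window. Illustrative check with a
// 1x2 window holding the values {3, 4}: the sum of squares is 25, n = 2, so
// out = sqrt(12.5) ~= 3.54 -- between the two inputs, but weighted toward
// the larger magnitude.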

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims,
            int stride_width, int stride_height, int pad_width, int pad_height,
            int filter_width, int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  float output_activation_min, output_activation_max;
  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
  L2Pool(input_data, input_dims, stride_width, stride_height, pad_width,
         pad_height, filter_width, filter_height, output_activation_min,
         output_activation_max, output_data, output_dims);
}

// legacy, for compatibility with old checked-in code
template <FusedActivationFunctionType Ac>
void L2Pool(const float* input_data, const Dims<4>& input_dims, int stride,
            int pad_width, int pad_height, int filter_width,
            int filter_height, float* output_data,
            const Dims<4>& output_dims) {
  L2Pool<Ac>(input_data, input_dims, stride, stride, pad_width, pad_height,
             filter_width, filter_height, output_data, output_dims);
}

inline void LocalResponseNormalization(const float* input_data,
                                       const Dims<4>& input_dims, int range,
                                       float bias, float alpha, float beta,
                                       float* output_data,
                                       const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("LocalResponseNormalization");
  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);

  const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);

  // Carry out local response normalization, vector by vector. Since the data
  // are stored column-major, a row-wise operation would probably not be
  // memory-efficient anyway, so we do an explicit for loop over the columns.
  const int double_range = range * 2;
  Eigen::VectorXf padded_square(data_in.rows() + double_range);
  padded_square.setZero();
  for (int r = 0; r < data_in.cols(); ++r) {
    // Do local response normalization for data_in(:, r).
    // First, compute the squares and store them in a buffer for repeated use.
    padded_square.block(range, 0, data_in.rows(), 1) =
        data_in.col(r).cwiseProduct(data_in.col(r)) * alpha;
    // Then, compute the scales and write them to data_out.
    float accumulated_scale = 0;
    for (int i = 0; i < double_range; ++i) {
      accumulated_scale += padded_square(i);
    }
    for (int i = 0; i < data_in.rows(); ++i) {
      accumulated_scale += padded_square(i + double_range);
      data_out(i, r) = bias + accumulated_scale;
      accumulated_scale -= padded_square(i);
    }
  }

  // In a few cases, the pow computation could benefit from speedups.
  if (beta == 1) {
    data_out.array() = data_in.array() * data_out.array().inverse();
  } else if (beta == 0.5) {
    data_out.array() = data_in.array() * data_out.array().sqrt().inverse();
  } else {
    data_out.array() = data_in.array() * data_out.array().pow(-beta);
  }
}

inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                    float beta, float* output_data,
                    const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Softmax");
  /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3);
  /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2);
  /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1);
  /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0);

  const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims);
  auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims);
  // Compute the exponentials first, removing the max coefficient for
  // numerical stability.
  out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta;
  // We are separating out the exp function so that exp can be vectorized.
  out_mat = out_mat.array().exp();
  // Normalize to get the activations.
  Eigen::Array<float, 1, Eigen::Dynamic> scale =
      out_mat.array().colwise().sum().inverse();
  out_mat.array().rowwise() *= scale;
}
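
// The max subtraction above relies on the identity
//   softmax(x)_i = exp(x_i - m) / sum_j exp(x_j - m)   for any constant m,
// since the common factor exp(-m) cancels between numerator and denominator.
// Choosing m = max_j x_j makes every exponent non-positive, so exp() stays in
// (0, 1] and cannot overflow. E.g. for x = {1000, 1001}, the naive exp(1000)
// overflows a float, while exp(-1) / (exp(-1) + exp(0)) evaluates safely to
// ~0.269.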

inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
                    int32 input_beta_multiplier, int32 input_beta_left_shift,
                    int diff_min, uint8* output_data,
                    const Dims<4>& output_dims) {
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large
  // as -32 before multiplying by input_beta_multiplier, and therefore as
  // large as -16 afterwards. Note that exp(-8) is definitely not
  // insignificant to accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
  using FixedPointAccum =
      gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;

  gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  const int outer_size = batches * height * width;

  for (int b = 0; b < outer_size; ++b) {
    const uint8* input_data_ptr = input_data + b * depth;
    uint8* output_data_ptr = output_data + b * depth;

    // Determine the largest entry in the current row.
    uint8 max_in_row = 0;
    {
      int c = 0;
#ifdef USE_NEON
      uint8x16_t max16_0 = vdupq_n_u8(0);
      uint8x16_t max16_1 = vdupq_n_u8(0);
      for (; c <= depth - 32; c += 32) {
        max16_0 = vmaxq_u8(max16_0, vld1q_u8(input_data_ptr + c + 0));
        max16_1 = vmaxq_u8(max16_1, vld1q_u8(input_data_ptr + c + 16));
      }
      uint8x16_t max16 = vmaxq_u8(max16_0, max16_1);
      if (c <= depth - 16) {
        max16 = vmaxq_u8(max16, vld1q_u8(input_data_ptr + c));
        c += 16;
      }
      uint8x8_t max8 = vmax_u8(vget_low_u8(max16), vget_high_u8(max16));
      if (c <= depth - 8) {
        max8 = vmax_u8(max8, vld1_u8(input_data_ptr + c));
        c += 8;
      }
      uint8x8_t max4 = vmax_u8(max8, vext_u8(max8, max8, 4));
      uint8x8_t max2 = vmax_u8(max4, vext_u8(max4, max4, 2));
      uint8x8_t max1 = vpmax_u8(max2, max2);
      max_in_row = vget_lane_u8(max1, 0);
#endif
      for (; c < depth; ++c) {
        max_in_row = std::max(max_in_row, input_data_ptr[c]);
      }
    }

#ifdef USE_NEON
    using FixedPointAccumInt32x4 =
        gemmlowp::FixedPoint<int32x4_t, kAccumulationIntegerBits>;
    using FixedPointScaledDiffInt32x4 =
        gemmlowp::FixedPoint<int32x4_t, kScaledDiffIntegerBits>;
    using FixedPoint0Int32x4 = gemmlowp::FixedPoint<int32x4_t, 0>;
    FixedPoint0Int32x4 input_beta_multiplier_f0 =
        FixedPoint0Int32x4::FromScalarRaw(input_beta_multiplier);
    int16x8_t max_in_row_s16 = vdupq_n_s16(max_in_row);
#endif

    // Compute the sum of exponentials of the differences of entries in the
    // current row from the largest entry in the current row.
    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
    {
      int c = 0;
#ifdef USE_NEON
      int32x4_t diff_min_s32 = vdupq_n_s32(diff_min);
      FixedPointAccumInt32x4 sum_of_exps_0 = FixedPointAccumInt32x4::Zero();
      FixedPointAccumInt32x4 sum_of_exps_1 = FixedPointAccumInt32x4::Zero();
      FixedPointAccumInt32x4 zeros = FixedPointAccumInt32x4::Zero();
      for (; c <= depth - 8; c += 8) {
        uint16x8_t input_u16 = vmovl_u8(vld1_u8(input_data_ptr + c));
        int16x8_t input_diff_s16 =
            vsubq_s16(vreinterpretq_s16_u16(input_u16), max_in_row_s16);
        int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
        int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
        int32x4_t mask_0 =
            gemmlowp::MaskIfGreaterThanOrEqual(input_diff_s32_0, diff_min_s32);
        int32x4_t mask_1 =
            gemmlowp::MaskIfGreaterThanOrEqual(input_diff_s32_1, diff_min_s32);
        FixedPointScaledDiffInt32x4 scaled_diff_0 =
            input_beta_multiplier_f0 *
            FixedPointScaledDiffInt32x4::FromRaw(
                gemmlowp::ShiftLeft(input_diff_s32_0, input_beta_left_shift));
        FixedPointScaledDiffInt32x4 scaled_diff_1 =
            input_beta_multiplier_f0 *
            FixedPointScaledDiffInt32x4::FromRaw(
                gemmlowp::ShiftLeft(input_diff_s32_1, input_beta_left_shift));
        FixedPointAccumInt32x4 exps_0 =
            gemmlowp::Rescale<kAccumulationIntegerBits>(
                exp_on_negative_values(scaled_diff_0));
        FixedPointAccumInt32x4 exps_1 =
            gemmlowp::Rescale<kAccumulationIntegerBits>(
                exp_on_negative_values(scaled_diff_1));
        FixedPointAccumInt32x4 masked_exps_0 =
            SelectUsingMask(mask_0, exps_0, zeros);
        FixedPointAccumInt32x4 masked_exps_1 =
            SelectUsingMask(mask_1, exps_1, zeros);
        sum_of_exps_0 = sum_of_exps_0 + masked_exps_0;
        sum_of_exps_1 = sum_of_exps_1 + masked_exps_1;
      }
      int32x4_t sum_of_exps_reduced_4 = (sum_of_exps_0 + sum_of_exps_1).raw();
      int32x2_t sum_of_exps_reduced_2 =
          vadd_s32(vget_low_s32(sum_of_exps_reduced_4),
                   vget_high_s32(sum_of_exps_reduced_4));
      int32x2_t sum_of_exps_reduced_1 =
          vpadd_s32(sum_of_exps_reduced_2, sum_of_exps_reduced_2);
      sum_of_exps =
          FixedPointAccum::FromRaw(vget_lane_s32(sum_of_exps_reduced_1, 0));
#endif
      for (; c < depth; ++c) {
        int32 input_diff = static_cast<int32>(input_data_ptr[c]) - max_in_row;
        if (input_diff >= diff_min) {
          const int32 input_diff_rescaled =
              MultiplyByQuantizedMultiplierGreaterThanOne(
                  input_diff, input_beta_multiplier, input_beta_left_shift);
          const FixedPointScaledDiff scaled_diff_f8 =
              FixedPointScaledDiff::FromRaw(input_diff_rescaled);
          sum_of_exps =
              sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                                exp_on_negative_values(scaled_diff_f8));
        }
      }
    }

    // Compute the fixed-point multiplier and shift that we need to apply to
    // perform a division by the above-computed sum-of-exponentials.
    int32 fixed_sum_of_exps = sum_of_exps.raw();
    int headroom_plus_one =
        __builtin_clz(static_cast<uint32>(fixed_sum_of_exps));
    // This is the number of bits to the left of the binary point above 1.0.
    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
    // no later adjustment will be needed.
    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
    int32 shifted_sum_minus_one = static_cast<int32>(
        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
        (static_cast<uint32>(1) << 31));
    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
        FixedPoint0::FromRaw(shifted_sum_minus_one));

    // Compute the quotients of exponentials of differences of entries in the
    // current row from the largest entry, over the previously-computed sum of
    // exponentials.
    {
      int c = 0;
#ifdef USE_NEON
      int16x8_t diff_min_s16 = vdupq_n_s16(diff_min);
      for (; c <= depth - 8; c += 8) {
        uint16x8_t input_u16 = vmovl_u8(vld1_u8(input_data_ptr + c));
        int16x8_t input_diff_s16 =
            vsubq_s16(vreinterpretq_s16_u16(input_u16), max_in_row_s16);
        int32x4_t input_diff_s32_0 = vmovl_s16(vget_low_s16(input_diff_s16));
        int32x4_t input_diff_s32_1 = vmovl_s16(vget_high_s16(input_diff_s16));
        uint8x8_t mask = vmovn_u16(vcgeq_s16(input_diff_s16, diff_min_s16));
        FixedPointScaledDiffInt32x4 scaled_diff_0 =
            input_beta_multiplier_f0 *
            FixedPointScaledDiffInt32x4::FromRaw(
                gemmlowp::ShiftLeft(input_diff_s32_0, input_beta_left_shift));
        FixedPointScaledDiffInt32x4 scaled_diff_1 =
            input_beta_multiplier_f0 *
            FixedPointScaledDiffInt32x4::FromRaw(
                gemmlowp::ShiftLeft(input_diff_s32_1, input_beta_left_shift));
        FixedPoint0Int32x4 exp_0 = exp_on_negative_values(scaled_diff_0);
        FixedPoint0Int32x4 exp_1 = exp_on_negative_values(scaled_diff_1);
        int32x4_t output_s32_0 = gemmlowp::RoundingDivideByPOT(
            vqrdmulhq_n_s32(exp_0.raw(), shifted_scale.raw()),
            num_bits_over_unit + 31 - 8);
        int32x4_t output_s32_1 = gemmlowp::RoundingDivideByPOT(
            vqrdmulhq_n_s32(exp_1.raw(), shifted_scale.raw()),
            num_bits_over_unit + 31 - 8);
        int16x8_t output_s16 =
            vcombine_s16(vqmovn_s32(output_s32_0), vqmovn_s32(output_s32_1));
        uint8x8_t output_u8 = vqmovun_s16(output_s16);
        uint8x8_t masked_output = vbsl_u8(mask, output_u8, vdup_n_u8(0));
        vst1_u8(output_data_ptr + c, masked_output);
      }
#endif
      for (; c < depth; ++c) {
        int32 input_diff = static_cast<int32>(input_data_ptr[c]) - max_in_row;
        if (input_diff >= diff_min) {
          const int32 input_diff_rescaled =
              MultiplyByQuantizedMultiplierGreaterThanOne(
                  input_diff, input_beta_multiplier, input_beta_left_shift);
          const FixedPointScaledDiff scaled_diff_f8 =
              FixedPointScaledDiff::FromRaw(input_diff_rescaled);

          FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
          int32 unsat_output = gemmlowp::RoundingDivideByPOT(
              (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);

          output_data_ptr[c] = std::max(std::min(unsat_output, 255), 0);
        } else {
          output_data_ptr[c] = 0;
        }
      }
    }
  }
}
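
// Worked example of the scale computation above: the accumulator format is
// Q12.19, so a sum of exponentials equal to 1.25 is stored as the raw value
// 1.25 * 2^19 = 0xA0000, whose 12 leading zero bits give
// headroom_plus_one = 12 and num_bits_over_unit = 12 - 12 = 0. Shifting the
// raw sum left by 12 renormalizes it to Q0.31, and subtracting 1.0 (1 << 31)
// leaves shifted_sum_minus_one = 0.25, from which
// one_over_one_plus_x_for_x_in_0_1 returns shifted_scale = 1 / 1.25 = 0.8.
// Each exponential is then multiplied by shifted_scale and divided by
// 2^(num_bits_over_unit + 31 - 8), which maps a probability p to the uint8
// value p * 256, saturated to 255.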

inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                     float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Logistic");
  auto input_map = MapAsVector(input_data, input_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  output_map.array() =
      input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>());
}

inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                     int32 input_zero_point, int32 input_range_radius,
                     int32 input_multiplier, int input_left_shift,
                     uint8* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Logistic");
  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
  const int size = RequiredBufferSizeForDims(input_dims);

  int c = 0;
#ifdef USE_NEON
  // Handle 16 values at a time.
  for (; c <= size - 16; c += 16) {
    // Read input uint8 values, cast to int16 and subtract input_zero_point.
    uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
    int16x8_t input_val_centered_0 =
        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
                  vdupq_n_s16(input_zero_point));
    int16x8_t input_val_centered_1 =
        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
                  vdupq_n_s16(input_zero_point));

    // Prepare the bit masks that we will use at the end to implement the
    // logic that was expressed in the scalar code with branching:
    //   if (input_val_centered < -input_range_radius) {
    //     output_val = 0;
    //   } else if (input_val_centered > input_range_radius) {
    //     output_val = 255;
    //   } else {
    //     ...
    uint16x8_t mask_rightclamp_0 =
        vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
    uint16x8_t mask_rightclamp_1 =
        vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
    uint16x8_t mask_leftclamp_0 =
        vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
    uint16x8_t mask_leftclamp_1 =
        vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
    uint8x16_t mask_rightclamp =
        vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
                    vshrn_n_u16(mask_rightclamp_1, 8));
    uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
                                            vshrn_n_u16(mask_leftclamp_1, 8));

    // This performs what is expressed in the scalar code as
    //   const int32 input_val_rescaled =
    //       MultiplyByQuantizedMultiplierGreaterThanOne(
    //           input_val_centered, input_multiplier, input_left_shift);
    int32x4_t input_val_rescaled_0 =
        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_1 =
        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_2 =
        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_3 =
        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
                  vdupq_n_s32(input_left_shift));
    input_val_rescaled_0 =
        vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
    input_val_rescaled_1 =
        vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
    input_val_rescaled_2 =
        vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
    input_val_rescaled_3 =
        vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);

    // Invoke gemmlowp::logistic on FixedPoint wrapping int32x4_t.
    using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
    const FixedPoint4 input_val_f4_0 =
        FixedPoint4::FromRaw(input_val_rescaled_0);
    const FixedPoint4 input_val_f4_1 =
        FixedPoint4::FromRaw(input_val_rescaled_1);
    const FixedPoint4 input_val_f4_2 =
        FixedPoint4::FromRaw(input_val_rescaled_2);
    const FixedPoint4 input_val_f4_3 =
        FixedPoint4::FromRaw(input_val_rescaled_3);
    const FixedPoint0 output_val_f0_0 = gemmlowp::logistic(input_val_f4_0);
    const FixedPoint0 output_val_f0_1 = gemmlowp::logistic(input_val_f4_1);
    const FixedPoint0 output_val_f0_2 = gemmlowp::logistic(input_val_f4_2);
    const FixedPoint0 output_val_f0_3 = gemmlowp::logistic(input_val_f4_3);

    // Divide by 2^23 as in the scalar code.
    using gemmlowp::RoundingDivideByPOT;
    int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 23);
    int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 23);
    int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 23);
    int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 23);

    // Cast output values to uint8, saturating.
    int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
                                              vqmovn_s32(output_val_s32_1));
    int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
                                              vqmovn_s32(output_val_s32_3));
    uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
                                           vqmovun_s16(output_val_s16_1));

    // Perform the bit-masking with the bit masks computed at the beginning;
    // see the comment there.
    output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
    output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);

    // Store back to memory.
    vst1q_u8(output_data + c, output_val_u8);
  }
#endif
  // Leftover loop: handle one value at a time with scalar code.
  for (; c < size; ++c) {
    const uint8 input_val_u8 = input_data[c];
    const int32 input_val_centered =
        static_cast<int32>(input_val_u8) - input_zero_point;
    uint8 output_val;
    if (input_val_centered < -input_range_radius) {
      output_val = 0;
    } else if (input_val_centered > input_range_radius) {
      output_val = 255;
    } else {
      const int32 input_val_rescaled =
          MultiplyByQuantizedMultiplierGreaterThanOne(
              input_val_centered, input_multiplier, input_left_shift);
      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
      const FixedPoint4 input_val_f4 =
          FixedPoint4::FromRaw(input_val_rescaled);
      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
      using gemmlowp::RoundingDivideByPOT;
      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
      if (output_val_s32 == 256) {
        output_val_s32 = 255;
      }
      TFLITE_DCHECK_GE(output_val_s32, 0);
      TFLITE_DCHECK_LE(output_val_s32, 255);
      output_val = static_cast<uint8>(output_val_s32);
    }
    output_data[c] = output_val;
  }
}
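
// How the two masks above implement the scalar clamping branches: the
// comparisons produce all-ones lanes where the condition holds, and
// narrowing the 16-bit masks by 8 keeps each uint8 lane all-ones or
// all-zeros. For one lane (illustrative values):
//   input well below range: rightclamp = 0x00, leftclamp = 0x00
//       -> (val | 0x00) & 0x00 = 0    (clamped to 0)
//   input well above range: rightclamp = 0xFF, leftclamp = 0xFF
//       -> (val | 0xFF) & 0xFF = 255  (clamped to 255)
//   input within range:     rightclamp = 0x00, leftclamp = 0xFF
//       -> (val | 0x00) & 0xFF = val  (computed logistic value kept)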

inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                 float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Tanh");
  auto input_map = MapAsVector(input_data, input_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  output_map.array() = input_map.array().tanh();
}

inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                 int32 input_zero_point, int32 input_range_radius,
                 int32 input_multiplier, int input_left_shift,
                 uint8* output_data, const Dims<4>& output_dims) {
  // Note that this is almost the exact same code as in Logistic().
  gemmlowp::ScopedProfilingLabel label("Tanh");
  /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
  /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
  /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
  /* depth */ MatchingArraySize(input_dims, 0, output_dims, 0);
  const int size = RequiredBufferSizeForDims(input_dims);

  int c = 0;
  int32_t output_zero_point = 128;
#ifdef USE_NEON
  // Handle 16 values at a time.
  for (; c <= size - 16; c += 16) {
    // Read input uint8 values, cast to int16 and subtract input_zero_point.
    uint8x16_t input_val_u8 = vld1q_u8(input_data + c);
    int16x8_t input_val_centered_0 =
        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input_val_u8))),
                  vdupq_n_s16(input_zero_point));
    int16x8_t input_val_centered_1 =
        vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input_val_u8))),
                  vdupq_n_s16(input_zero_point));

    // Prepare the bit masks that we will use at the end to implement the
    // logic that was expressed in the scalar code with branching:
    //   if (input_val_centered < -input_range_radius) {
    //     output_val = 0;
    //   } else if (input_val_centered > input_range_radius) {
    //     output_val = 255;
    //   } else {
    //     ...
    uint16x8_t mask_rightclamp_0 =
        vcgtq_s16(input_val_centered_0, vdupq_n_s16(input_range_radius));
    uint16x8_t mask_rightclamp_1 =
        vcgtq_s16(input_val_centered_1, vdupq_n_s16(input_range_radius));
    uint16x8_t mask_leftclamp_0 =
        vcgeq_s16(input_val_centered_0, vdupq_n_s16(-input_range_radius));
    uint16x8_t mask_leftclamp_1 =
        vcgeq_s16(input_val_centered_1, vdupq_n_s16(-input_range_radius));
    uint8x16_t mask_rightclamp =
        vcombine_u8(vshrn_n_u16(mask_rightclamp_0, 8),
                    vshrn_n_u16(mask_rightclamp_1, 8));
    uint8x16_t mask_leftclamp = vcombine_u8(vshrn_n_u16(mask_leftclamp_0, 8),
                                            vshrn_n_u16(mask_leftclamp_1, 8));

    // This performs what is expressed in the scalar code as
    //   const int32 input_val_rescaled =
    //       MultiplyByQuantizedMultiplierGreaterThanOne(
    //           input_val_centered, input_multiplier, input_left_shift);
    int32x4_t input_val_rescaled_0 =
        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_0)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_1 =
        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_0)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_2 =
        vshlq_s32(vmovl_s16(vget_low_s16(input_val_centered_1)),
                  vdupq_n_s32(input_left_shift));
    int32x4_t input_val_rescaled_3 =
        vshlq_s32(vmovl_s16(vget_high_s16(input_val_centered_1)),
                  vdupq_n_s32(input_left_shift));
    input_val_rescaled_0 =
        vqrdmulhq_n_s32(input_val_rescaled_0, input_multiplier);
    input_val_rescaled_1 =
        vqrdmulhq_n_s32(input_val_rescaled_1, input_multiplier);
    input_val_rescaled_2 =
        vqrdmulhq_n_s32(input_val_rescaled_2, input_multiplier);
    input_val_rescaled_3 =
        vqrdmulhq_n_s32(input_val_rescaled_3, input_multiplier);

    // Invoke gemmlowp::tanh on FixedPoint wrapping int32x4_t.
    using FixedPoint4 = gemmlowp::FixedPoint<int32x4_t, 4>;
    using FixedPoint0 = gemmlowp::FixedPoint<int32x4_t, 0>;
    const FixedPoint4 input_val_f4_0 =
        FixedPoint4::FromRaw(input_val_rescaled_0);
    const FixedPoint4 input_val_f4_1 =
        FixedPoint4::FromRaw(input_val_rescaled_1);
    const FixedPoint4 input_val_f4_2 =
        FixedPoint4::FromRaw(input_val_rescaled_2);
    const FixedPoint4 input_val_f4_3 =
        FixedPoint4::FromRaw(input_val_rescaled_3);
    const FixedPoint0 output_val_f0_0 = gemmlowp::tanh(input_val_f4_0);
    const FixedPoint0 output_val_f0_1 = gemmlowp::tanh(input_val_f4_1);
    const FixedPoint0 output_val_f0_2 = gemmlowp::tanh(input_val_f4_2);
    const FixedPoint0 output_val_f0_3 = gemmlowp::tanh(input_val_f4_3);

    // Divide by 2^24 as in the scalar code.
    using gemmlowp::RoundingDivideByPOT;
    int32x4_t output_val_s32_0 = RoundingDivideByPOT(output_val_f0_0.raw(), 24);
    int32x4_t output_val_s32_1 = RoundingDivideByPOT(output_val_f0_1.raw(), 24);
    int32x4_t output_val_s32_2 = RoundingDivideByPOT(output_val_f0_2.raw(), 24);
    int32x4_t output_val_s32_3 = RoundingDivideByPOT(output_val_f0_3.raw(), 24);

    // Add the output zero point.
    int32x4_t output_zero_point_s32 = vdupq_n_s32(output_zero_point);
    output_val_s32_0 = vaddq_s32(output_val_s32_0, output_zero_point_s32);
    output_val_s32_1 = vaddq_s32(output_val_s32_1, output_zero_point_s32);
    output_val_s32_2 = vaddq_s32(output_val_s32_2, output_zero_point_s32);
    output_val_s32_3 = vaddq_s32(output_val_s32_3, output_zero_point_s32);

    // Cast output values to uint8, saturating.
    int16x8_t output_val_s16_0 = vcombine_s16(vqmovn_s32(output_val_s32_0),
                                              vqmovn_s32(output_val_s32_1));
    int16x8_t output_val_s16_1 = vcombine_s16(vqmovn_s32(output_val_s32_2),
                                              vqmovn_s32(output_val_s32_3));
    uint8x16_t output_val_u8 = vcombine_u8(vqmovun_s16(output_val_s16_0),
                                           vqmovun_s16(output_val_s16_1));

    // Perform the bit-masking with the bit masks computed at the beginning;
    // see the comment there.
    output_val_u8 = vorrq_u8(output_val_u8, mask_rightclamp);
    output_val_u8 = vandq_u8(output_val_u8, mask_leftclamp);

    // Store back to memory.
    vst1q_u8(output_data + c, output_val_u8);
  }
#endif
  // Leftover loop: handle one value at a time with scalar code.
  for (; c < size; ++c) {
    const uint8 input_val_u8 = input_data[c];
    const int32 input_val_centered =
        static_cast<int32>(input_val_u8) - input_zero_point;
    uint8 output_val;
    if (input_val_centered < -input_range_radius) {
      output_val = 0;
    } else if (input_val_centered > input_range_radius) {
      output_val = 255;
    } else {
      const int32 input_val_rescaled =
          MultiplyByQuantizedMultiplierGreaterThanOne(
              input_val_centered, input_multiplier, input_left_shift);
      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
      const FixedPoint4 input_val_f4 =
          FixedPoint4::FromRaw(input_val_rescaled);
      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
      using gemmlowp::RoundingDivideByPOT;
      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
      output_val_s32 += output_zero_point;
      if (output_val_s32 == 256) {
        output_val_s32 = 255;
      }
      TFLITE_DCHECK_GE(output_val_s32, 0);
      TFLITE_DCHECK_LE(output_val_s32, 255);
      output_val = static_cast<uint8>(output_val_s32);
    }
    output_data[c] = output_val;
  }
}

inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                       int32 zero_point, double scale, float* output_data,
                       const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Dequantize");
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          int32 val = input_data[Offset(input_dims, c, x, y, b)];
          float result = static_cast<float>(scale * (val - zero_point));
          output_data[Offset(output_dims, c, x, y, b)] = result;
        }
      }
    }
  }
}
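
// Worked example of the affine dequantization above (illustrative numbers):
// with zero_point = 128 and scale = 0.5, the quantized value 130 maps to
// 0.5 * (130 - 128) = 1.0, the value 128 maps exactly to 0.0, and the value
// 0 maps to 0.5 * (0 - 128) = -64.0. The zero point is the quantized value
// that represents the real number 0 exactly.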

inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
                      float rmin, float rmax, float* output_data,
                      const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("FakeQuant");

  // 0 should always be a representable value. Let's assume that the initial
  // min,max range contains 0.
  TFLITE_DCHECK_LE(rmin, 0.);
  TFLITE_DCHECK_GE(rmax, 0.);

  // Determine quantization parameters: zero_point, scale.
  using Integer = uint8;
  const Integer qmin = std::numeric_limits<Integer>::min();
  const Integer qmax = std::numeric_limits<Integer>::max();
  const float qmin_float = qmin;
  const float qmax_float = qmax;
  int32 zero_point = 0;
  float scale = 0.f;
  // If rmin==rmax, both must be zero per the above assertion, so we are done.
  if (rmin != rmax) {
    // First determine the scale.
    scale = (rmax - rmin) / (qmax_float - qmin_float);

    // Zero-point computation.
    // First the initial floating-point computation. The zero point can be
    // determined from solving an affine equation for any known pair
    // (real value, corresponding quantized value).
    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
    // The arithmetic error on the zero point computed from either pair will
    // be roughly machine_epsilon * (sum of absolute values of terms), so we
    // want to use the variant that adds the smaller terms.
    const float zero_point_from_min = qmin_float - rmin / scale;
    const float zero_point_from_max = qmax_float - rmax / scale;
    const float zero_point_from_min_error =
        std::abs(qmin_float) + std::abs(rmin / scale);
    const float zero_point_from_max_error =
        std::abs(qmax_float) + std::abs(rmax / scale);

    const float zero_point_float =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;

    // Now we need to nudge the zero point to be an integer (our zero points
    // are integer, and this is motivated by the requirement to be able to
    // represent the real value "0" exactly as a quantized value, which is
    // required in multiple places, for example in Im2col with SAME padding).
    if (zero_point_float < qmin_float) {
      zero_point = qmin;
    } else if (zero_point_float > qmax_float) {
      zero_point = qmax;
    } else {
      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
    }
    // The zero point should always be in the range of quantized values,
    // [qmin, qmax].
    TFLITE_DCHECK_GE(zero_point, qmin);
    TFLITE_DCHECK_LE(zero_point, qmax);
  }

  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        for (int c = 0; c < depth; ++c) {
          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
          const float unclamped_quantized_val =
              TfLiteRound(zero_point + src_val / scale);
          const float quantized_val = std::min(
              qmax_float, std::max(qmin_float, unclamped_quantized_val));
          const float dst_val = scale * (quantized_val - zero_point);
          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
        }
      }
    }
  }
}
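
// Worked example of the parameter computation above (illustrative numbers):
// for rmin = -1.0 and rmax = 3.0 with uint8, scale = (3.0 - (-1.0)) / 255
// ~= 0.01569, and zero_point_from_min = 0 - (-1.0 / 0.01569) ~= 63.75, which
// rounds to the integer zero point 64. After nudging, the real value
// represented by quantized 64 is exactly 0.0; every input is then rounded to
// one of the 256 representable values scale * (q - 64) for q in [0, 255],
// which is what makes this a "fake" quantization of float data.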

template <typename SrcT, typename DstT>
inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                 DstT* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Cast");
  auto input_map = MapAsVector(input_data, input_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  output_map.array() = input_map.array().template cast<DstT>();
}

inline void Floor(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Floor");
  auto input_map = MapAsVector(input_data, input_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  output_map.array() = Eigen::floor(input_map.array());
}

template <typename T>
inline void Gather(const T* input_data, const Dims<4>& input_dims,
                   int input_rank, const int32* coords_data,
                   const Dims<4>& coords_dims, T* output_data,
                   const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Gather");

  TFLITE_DCHECK(coords_dims.sizes[0] == output_dims.sizes[input_rank - 1]);
  int stride = input_dims.strides[input_rank - 1];
  T* out = output_data;

  for (int i = 0; i < coords_dims.sizes[0]; i++) {
    TFLITE_DCHECK_GE(coords_data[i], 0);
    TFLITE_DCHECK_LT(coords_data[i], input_dims.sizes[input_rank - 1]);
    const T* in = input_data + coords_data[i] * stride;
    memcpy(out, in, sizeof(T) * stride);
    out += stride;
  }
}
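
// Illustrative note (added commentary, not part of the original library):
// Gather copies whole slices along the gathered (outermost) axis of the
// rank-`input_rank` view. For example, assuming a 4x3 float input viewed
// with input_rank = 2 and coords_data = {2, 0}, each slice holds 3 floats
// and the output is row 2 followed by row 0, 6 floats in total.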

#ifdef USE_NEON
inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                 float scale, float* output_ptr) {
  int ic = 0;
  // Handle 32 input channels at a time.
  for (; ic <= depth - 32; ic += 32) {
    float32x4x2_t input[4];
    for (int i = 0; i < 4; i++) {
      input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
      input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
    }
    float32x4x2_t acc[4];
    for (int i = 0; i < 4; i++) {
      acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
      acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
    }
    for (int i = 0; i < 4; i++) {
      acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
      acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
    }
    for (int i = 0; i < 4; i++) {
      vst1q_f32(output_ptr, acc[i].val[0]);
      vst1q_f32(output_ptr + 4, acc[i].val[1]);
      output_ptr += 8;
    }
    input_ptr += 32;
  }
  // Handle 16 input channels at a time.
  for (; ic <= depth - 16; ic += 16) {
    float32x4x2_t input[2];
    for (int i = 0; i < 2; i++) {
      input[i].val[0] = vld1q_f32(input_ptr + 8 * i);
      input[i].val[1] = vld1q_f32(input_ptr + 8 * i + 4);
    }
    float32x4x2_t acc[2];
    for (int i = 0; i < 2; i++) {
      acc[i].val[0] = vld1q_f32(output_ptr + 8 * i);
      acc[i].val[1] = vld1q_f32(output_ptr + 8 * i + 4);
    }
    for (int i = 0; i < 2; i++) {
      acc[i].val[0] = vmlaq_n_f32(acc[i].val[0], input[i].val[0], scale);
      acc[i].val[1] = vmlaq_n_f32(acc[i].val[1], input[i].val[1], scale);
    }
    for (int i = 0; i < 2; i++) {
      vst1q_f32(output_ptr, acc[i].val[0]);
      vst1q_f32(output_ptr + 4, acc[i].val[1]);
      output_ptr += 8;
    }
    input_ptr += 16;
  }
  // Handle 8 input channels at a time.
  for (; ic <= depth - 8; ic += 8) {
    float32x4x2_t input;
    input.val[0] = vld1q_f32(input_ptr);
    input.val[1] = vld1q_f32(input_ptr + 4);

    float32x4x2_t acc;
    acc.val[0] = vld1q_f32(output_ptr);
    acc.val[1] = vld1q_f32(output_ptr + 4);
    acc.val[0] = vmlaq_n_f32(acc.val[0], input.val[0], scale);
    acc.val[1] = vmlaq_n_f32(acc.val[1], input.val[1], scale);

    vst1q_f32(output_ptr, acc.val[0]);
    vst1q_f32(output_ptr + 4, acc.val[1]);

    input_ptr += 8;
    output_ptr += 8;
  }
  // Handle 4 input channels at a time.
  for (; ic <= depth - 4; ic += 4) {
    float32x4_t input = vld1q_f32(input_ptr);
    float32x4_t acc = vld1q_f32(output_ptr);

    acc = vmlaq_n_f32(acc, input, scale);
    vst1q_f32(output_ptr, acc);

    input_ptr += 4;
    output_ptr += 4;
  }
  // Handle one input channel at a time.
  for (; ic < depth; ic++) {
    *output_ptr += *input_ptr * scale;
    output_ptr++;
    input_ptr++;
  }
}
#else
inline void ResizeBilinearKernel(const float* input_ptr, int32 depth,
                                 float scale, float* output_ptr) {
  for (int32 i = 0; i < depth; i++) {
    *output_ptr += *input_ptr * scale;
    output_ptr++;
    input_ptr++;
  }
}
#endif
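
// Illustrative note (added commentary, not part of the original library):
// both variants of ResizeBilinearKernel compute the same accumulation,
//   output_ptr[i] += input_ptr[i] * scale   for i in [0, depth),
// so calling the kernel once per corner pixel with the four bilinear weights
// sums to the interpolated value. The NEON version merely unrolls the loop.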

inline void ResizeBilinearKernel2x2(int32 x0, int32 x1, int32 y0, int32 y1,
                                    int32 x, int32 y, int32 depth, int32 batch,
                                    const float* input_data,
                                    const Dims<4>& input_dims,
                                    float* output_data,
                                    const Dims<4>& output_dims) {
  const int32 input_width = ArraySize(input_dims, 1);
  const int32 output_width = ArraySize(output_dims, 1);

  const int32 input_x_offset = (x1 - x0) * depth;
  const int32 input_y_offset = (y1 - y0) * depth * input_width;
  const int32 output_x_offset = depth;
  const int32 output_y_offset = depth * output_width;

#ifdef USE_NEON
  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
  TFLITE_DCHECK(x1 >= x0);
  TFLITE_DCHECK(y1 >= y0);

  int ic = 0;
  // Handle 8 input channels at a time.
  for (; ic <= depth - 8; ic += 8) {
    const float* input_ptr = nullptr;

    float32x4x2_t x0y0;
    input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
    x0y0.val[0] = vld1q_f32(input_ptr);
    x0y0.val[1] = vld1q_f32(input_ptr + 4);

    float32x4x2_t x1y0;
    input_ptr += input_x_offset;
    x1y0.val[0] = vld1q_f32(input_ptr);
    x1y0.val[1] = vld1q_f32(input_ptr + 4);

    float32x4x2_t x0y1;
    input_ptr += -input_x_offset + input_y_offset;
    x0y1.val[0] = vld1q_f32(input_ptr);
    x0y1.val[1] = vld1q_f32(input_ptr + 4);

    float32x4x2_t x1y1;
    input_ptr += input_x_offset;
    x1y1.val[0] = vld1q_f32(input_ptr);
    x1y1.val[1] = vld1q_f32(input_ptr + 4);

    // Top left corner.
    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
    vst1q_f32(output_ptr, x0y0.val[0]);
    vst1q_f32(output_ptr + 4, x0y0.val[1]);

    // Top right corner.
    output_ptr += output_x_offset;
    float32x4x2_t tr;
    tr.val[0] = vaddq_f32(x0y0.val[0], x1y0.val[0]);
    tr.val[1] = vaddq_f32(x0y0.val[1], x1y0.val[1]);
    tr.val[0] = vmulq_n_f32(tr.val[0], 0.5f);
    tr.val[1] = vmulq_n_f32(tr.val[1], 0.5f);

    vst1q_f32(output_ptr, tr.val[0]);
    vst1q_f32(output_ptr + 4, tr.val[1]);

    // Bottom left corner.
    output_ptr += -output_x_offset + output_y_offset;
    float32x4x2_t bl;
    bl.val[0] = vaddq_f32(x0y0.val[0], x0y1.val[0]);
    bl.val[1] = vaddq_f32(x0y0.val[1], x0y1.val[1]);
    bl.val[0] = vmulq_n_f32(bl.val[0], 0.5f);
    bl.val[1] = vmulq_n_f32(bl.val[1], 0.5f);
    vst1q_f32(output_ptr, bl.val[0]);
    vst1q_f32(output_ptr + 4, bl.val[1]);

    // Bottom right corner.
    output_ptr += output_x_offset;
    float32x4x2_t br;
    br.val[0] = vaddq_f32(x1y0.val[0], x1y1.val[0]);
    br.val[1] = vaddq_f32(x1y0.val[1], x1y1.val[1]);
    br.val[0] = vmlaq_n_f32(bl.val[0], br.val[0], 0.5f);
    br.val[1] = vmlaq_n_f32(bl.val[1], br.val[1], 0.5f);
    br.val[0] = vmulq_n_f32(br.val[0], 0.5f);
    br.val[1] = vmulq_n_f32(br.val[1], 0.5f);
    vst1q_f32(output_ptr, br.val[0]);
    vst1q_f32(output_ptr + 4, br.val[1]);
  }
  // Handle 4 input channels at a time.
  for (; ic <= depth - 4; ic += 4) {
    const float* input_ptr = &input_data[Offset(input_dims, ic, x0, y0, batch)];
    float32x4_t x0y0 = vld1q_f32(input_ptr);
    float32x4_t x1y0 = vld1q_f32(input_ptr + input_x_offset);
    float32x4_t x0y1 = vld1q_f32(input_ptr + input_y_offset);
    float32x4_t x1y1 = vld1q_f32(input_ptr + input_x_offset + input_y_offset);

    // Top left corner.
    float* output_ptr = &output_data[Offset(output_dims, ic, x, y, batch)];
    vst1q_f32(output_ptr, x0y0);

    // Top right corner.
    output_ptr += output_x_offset;
    float32x4_t tr = vaddq_f32(x0y0, x1y0);
    tr = vmulq_n_f32(tr, 0.5f);
    vst1q_f32(output_ptr, tr);

    // Bottom left corner.
    output_ptr += -output_x_offset + output_y_offset;
    float32x4_t bl = vaddq_f32(x0y0, x0y1);
    bl = vmulq_n_f32(bl, 0.5f);
    vst1q_f32(output_ptr, bl);

    // Bottom right corner.
    output_ptr += output_x_offset;
    float32x4_t br = vaddq_f32(x1y0, x1y1);
    br = vmlaq_n_f32(bl, br, 0.5f);
    br = vmulq_n_f32(br, 0.5f);
    vst1q_f32(output_ptr, br);
  }
  // Handle one input channel at a time.
  for (; ic < depth; ic++) {
    const int32 input_offset = Offset(input_dims, ic, x0, y0, batch);

    float x0y0 = input_data[input_offset];
    float x1y0 = input_data[input_offset + input_x_offset];
    float x0y1 = input_data[input_offset + input_y_offset];
    float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];

    // Top left corner.
    const int32 output_offset = Offset(output_dims, ic, x, y, batch);
    output_data[output_offset] = x0y0;

    // Top right corner.
    output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;

    // Bottom left corner.
    float output = (x0y0 + x0y1) / 2;
    output_data[output_offset + output_y_offset] = output;

    // Bottom right corner.
    output_data[output_offset + output_x_offset + output_y_offset] =
        (output + ((x1y0 + x1y1) / 2)) / 2;
  }
#else
  for (int ch = 0; ch < depth; ch++) {
    const int32 input_offset = Offset(input_dims, ch, x0, y0, batch);

    float x0y0 = input_data[input_offset];
    float x1y0 = input_data[input_offset + input_x_offset];
    float x0y1 = input_data[input_offset + input_y_offset];
    float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];

    // Top left corner.
    const int32 output_offset = Offset(output_dims, ch, x, y, batch);
    output_data[output_offset] = x0y0;

    // Top right corner.
    output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;

    // Bottom left corner.
    float output = (x0y0 + x0y1) / 2;
    output_data[output_offset + output_y_offset] = output;

    // Bottom right corner.
    output_data[output_offset + output_x_offset + output_y_offset] =
        (output + ((x1y0 + x1y1) / 2)) / 2;
  }
#endif
}
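
// Illustrative note (added commentary, not part of the original library):
// for the 2x-upsample specialization, the four outputs of each 2x2 cell
// reduce to fixed blends of the neighboring input pixels:
//   top-left     = x0y0
//   top-right    = (x0y0 + x1y0) / 2
//   bottom-left  = (x0y0 + x0y1) / 2
//   bottom-right = (x0y0 + x0y1 + x1y0 + x1y1) / 4
// which is exactly what both the NEON and the scalar paths above compute.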

inline void ResizeBilinear2x2(const float* input_data,
                              const Dims<4>& input_dims, float* output_data,
                              const Dims<4>& output_dims, int32 batches,
                              int32 input_height, int32 input_width,
                              int32 depth, int32 output_height,
                              int32 output_width) {
  for (int b = 0; b < batches; b++) {
    for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) {
      for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) {
        int32 x1 = std::min(x0 + 1, input_width - 1);
        int32 y1 = std::min(y0 + 1, input_height - 1);
        ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_data,
                                input_dims, output_data, output_dims);
      }
    }
  }
}

inline void ResizeBilinearGeneric(const float* input_data,
                                  const Dims<4>& input_dims,
                                  float* output_data,
                                  const Dims<4>& output_dims, int32 batches,
                                  int32 input_height, int32 input_width,
                                  int32 depth, int32 output_height,
                                  int32 output_width, float height_scale,
                                  float width_scale) {
  memset(output_data, 0,
         batches * output_height * output_width * depth * sizeof(float));

  int32 output_offset = 0;
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < output_height; ++y) {
      float input_y = y * height_scale;
      int32 y0 = static_cast<int32>(std::floor(input_y));
      int32 y1 = std::min(y0 + 1, input_height - 1);
      for (int x = 0; x < output_width; ++x) {
        float input_x = x * width_scale;
        int32 x0 = static_cast<int32>(std::floor(input_x));
        int32 x1 = std::min(x0 + 1, input_width - 1);
        float* output_ptr = &output_data[output_offset];

        // Run the kernel on the four corners of the bilinear resize
        // algorithm; each call accumulates one weighted corner into the
        // output pixel.
        int32 input_offset = Offset(input_dims, 0, x0, y0, b);
        float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
        const float* input_ptr = &input_data[input_offset];
        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);

        input_offset = Offset(input_dims, 0, x1, y0, b);
        scale = (1 - (input_y - y0)) * (input_x - x0);
        input_ptr = &input_data[input_offset];
        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);

        input_offset = Offset(input_dims, 0, x0, y1, b);
        scale = (input_y - y0) * (1 - (input_x - x0));
        input_ptr = &input_data[input_offset];
        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);

        input_offset = Offset(input_dims, 0, x1, y1, b);
        scale = (input_y - y0) * (input_x - x0);
        input_ptr = &input_data[input_offset];
        ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);

        output_offset += depth;
      }
    }
  }
}
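
// Illustrative note (added commentary, not part of the original library):
// with dx = input_x - x0 and dy = input_y - y0, the four corner weights used
// above are (1-dy)(1-dx), (1-dy)dx, dy(1-dx) and dy*dx, which always sum to
// 1. For example, input_x = 1.25 and input_y = 2.5 give weights 0.375,
// 0.125, 0.375 and 0.125 for corners (1,2), (2,2), (1,3) and (2,3).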

inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                           const int32* output_size_data,
                           const Dims<4>& output_size_dims, float* output_data,
                           const Dims<4>& output_dims, bool align_corners) {
  gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  int32 input_height = ArraySize(input_dims, 2);
  int32 input_width = ArraySize(input_dims, 1);
  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);

  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];

  // Specialize for the 2x2 upsample case.
  if (!align_corners && output_height == 2 * input_height &&
      output_width == 2 * input_width) {
    ResizeBilinear2x2(input_data, input_dims, output_data, output_dims,
                      batches, input_height, input_width, depth,
                      output_height, output_width);
  } else {
    float height_scale = static_cast<float>(input_height) / output_height;
    float width_scale = static_cast<float>(input_width) / output_width;
    if (align_corners && output_height > 1) {
      height_scale =
          static_cast<float>(input_height - 1) / (output_height - 1);
    }
    if (align_corners && output_width > 1) {
      width_scale = static_cast<float>(input_width - 1) / (output_width - 1);
    }

    ResizeBilinearGeneric(input_data, input_dims, output_data, output_dims,
                          batches, input_height, input_width, depth,
                          output_height, output_width, height_scale,
                          width_scale);
  }
}

// Legacy overload, kept for compatibility with old checked-in code; it
// defaults to align_corners = false.
inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                           const int32* output_size_data,
                           const Dims<4>& output_size_dims, float* output_data,
                           const Dims<4>& output_dims) {
  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
                 output_data, output_dims, /*align_corners=*/false);
}

template <typename T>
inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims,
                           const int32* paddings_data,
                           const Dims<4>& paddings_dims, T* output_data,
                           const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("SpaceToBatchND");

  const int output_batch_size = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];
  const int padding_top = paddings_data[0];
  const int padding_left = paddings_data[2];

  for (int out_b = 0; out_b < output_batch_size; ++out_b) {
    int input_batch = out_b % input_batch_size;
    int shift_w = (out_b / input_batch_size) % block_shape_width;
    int shift_h = (out_b / input_batch_size) / block_shape_width;
    for (int out_h = 0; out_h < output_height; ++out_h) {
      for (int out_w = 0; out_w < output_width; ++out_w) {
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_b);
        if (out_h * block_shape_height + shift_h < padding_top ||
            out_h * block_shape_height + shift_h >=
                padding_top + input_height ||
            out_w * block_shape_width + shift_w < padding_left ||
            out_w * block_shape_width + shift_w >=
                padding_left + input_width) {
          // Padding region: fill with zeros.
          memset(out, 0, depth * sizeof(T));
        } else {
          const T* in =
              input_data +
              Offset(input_dims, 0,
                     (out_w * block_shape_width + shift_w) - padding_left,
                     (out_h * block_shape_height + shift_h) - padding_top,
                     input_batch);
          memcpy(out, in, depth * sizeof(T));
        }
      }
    }
  }
}
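
// Illustrative note (added commentary, not part of the original library):
// assuming a 1x4x4xD input, block_shape = {2, 2} and zero paddings,
// SpaceToBatchND produces a 4x2x2xD output in which output batch
// out_b = shift_h * 2 + shift_w holds the input pixels whose (h, w)
// coordinates are congruent to (shift_h, shift_w) modulo the block shape;
// e.g. batch 3 holds input positions (1,1), (1,3), (3,1) and (3,3).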

template <typename T>
inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                           const int32* block_shape_data,
                           const Dims<4>& block_shape_dims, T* output_data,
                           const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("BatchToSpaceND");

  const int output_batch_size = ArraySize(output_dims, 3);
  const int input_batch_size = ArraySize(input_dims, 3);
  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  const int block_shape_height = block_shape_data[0];
  const int block_shape_width = block_shape_data[1];

  for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) {
    for (int in_h = 0; in_h < input_height; ++in_h) {
      for (int in_w = 0; in_w < input_width; ++in_w) {
        int out_batch = in_batch % output_batch_size;
        int out_w = in_w * block_shape_width +
                    (in_batch / output_batch_size) % block_shape_width;
        int out_h = in_h * block_shape_height +
                    (in_batch / output_batch_size) / block_shape_width;
        T* out = output_data + Offset(output_dims, 0, out_w, out_h, out_batch);
        const T* in = input_data + Offset(input_dims, 0, in_w, in_h, in_batch);
        memcpy(out, in, depth * sizeof(T));
      }
    }
  }
}
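
// Illustrative note (added commentary, not part of the original library):
// with zero paddings/crops, BatchToSpaceND is the inverse of SpaceToBatchND:
// the input batch index decomposes into an output batch plus a
// (shift_h, shift_w) block offset, so the 4x2x2xD result of the example
// above is scattered back into the original 1x4x4xD layout.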

template <typename T>
inline void Pad(const T* input_data, const Dims<4>& input_dims,
                const std::vector<int>& left_paddings,
                const std::vector<int>& right_paddings, T* output_data,
                const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Pad");
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int left_b_padding = left_paddings[3];
  const int left_h_padding = left_paddings[2];
  const int left_w_padding = left_paddings[1];
  const int left_d_padding = left_paddings[0];

  const int right_b_padding = right_paddings[3];
  const int right_h_padding = right_paddings[2];
  const int right_w_padding = right_paddings[1];
  const int right_d_padding = right_paddings[0];

  const int input_depth = ArraySize(input_dims, 0);

  if (left_b_padding != 0) {
    memset(output_data, 0,
           left_b_padding * output_height * output_width * output_depth *
               sizeof(T));
  }
  for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
       ++out_b) {
    if (left_h_padding != 0) {
      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
             left_h_padding * output_width * output_depth * sizeof(T));
    }
    for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
         ++out_h) {
      if (left_w_padding != 0) {
        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
               left_w_padding * output_depth * sizeof(T));
      }
      for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
           ++out_w) {
        if (left_d_padding != 0) {
          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
                 left_d_padding * sizeof(T));
        }

        T* out = output_data +
                 Offset(output_dims, left_d_padding, out_w, out_h, out_b);
        const T* in =
            input_data + Offset(input_dims, 0, out_w - left_w_padding,
                                out_h - left_h_padding,
                                out_b - left_b_padding);
        memcpy(out, in, input_depth * sizeof(T));

        if (right_d_padding != 0) {
          memset(output_data +
                     Offset(output_dims, output_depth - right_d_padding,
                            out_w, out_h, out_b),
                 0, right_d_padding * sizeof(T));
        }
      }
      if (right_w_padding != 0) {
        memset(output_data +
                   Offset(output_dims, 0, output_width - right_w_padding,
                          out_h, out_b),
               0, right_w_padding * output_depth * sizeof(T));
      }
    }
    if (right_h_padding != 0) {
      memset(output_data + Offset(output_dims, 0, 0,
                                  output_height - right_h_padding, out_b),
             0, right_h_padding * output_width * output_depth * sizeof(T));
    }
  }
  if (right_b_padding != 0) {
    memset(output_data +
               Offset(output_dims, 0, 0, 0, output_batch - right_b_padding),
           0,
           right_b_padding * output_height * output_width * output_depth *
               sizeof(T));
  }
}

template <typename T>
inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                         int begin_mask, int end_mask,
                         const std::vector<int>& starts,
                         const std::vector<int>& stops,
                         const std::vector<int>& strides, T* output_data,
                         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("StridedSlice");
  const int start_b = (begin_mask & 8) ? 0 : starts[3];
  const int stop_b = (end_mask & 8) ? input_dims.sizes[3] : stops[3];
  const int start_h = (begin_mask & 4) ? 0 : starts[2];
  const int stop_h = (end_mask & 4) ? input_dims.sizes[2] : stops[2];
  const int start_w = (begin_mask & 2) ? 0 : starts[1];
  const int stop_w = (end_mask & 2) ? input_dims.sizes[1] : stops[1];
  const int start_d = (begin_mask & 1) ? 0 : starts[0];
  const int stop_d = (end_mask & 1) ? input_dims.sizes[0] : stops[0];

  T* out_ptr = output_data;
  if (strides[0] == 0) {
    // A depth stride of 0 means "take the whole [start_d, stop_d) range
    // contiguously", which allows a memcpy over the innermost dimension.
    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
          const int len = stop_d - start_d;
          memcpy(out_ptr,
                 input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
                 len * sizeof(T));
          out_ptr += len;
        }
      }
    }
  } else {
    for (int in_b = start_b; in_b < stop_b; in_b += strides[3]) {
      for (int in_h = start_h; in_h < stop_h; in_h += strides[2]) {
        for (int in_w = start_w; in_w < stop_w; in_w += strides[1]) {
          for (int in_d = start_d; in_d < stop_d; in_d += strides[0]) {
            *out_ptr++ =
                input_data[Offset(input_dims, in_d, in_w, in_h, in_b)];
          }
        }
      }
    }
  }
}

template <typename T>
inline void Slice(const T* input_data, const Dims<4>& input_dims,
                  const std::vector<int>& begin, const std::vector<int>& size,
                  T* output_data, const Dims<4>& output_dims) {
  // TODO(dkalenichenko): This op only supports 4D tensors.
  TFLITE_DCHECK_EQ(begin.size(), 4);
  TFLITE_DCHECK_EQ(size.size(), 4);
  const int start_b = begin[3];
  const int stop_b =
      size[3] == -1 ? input_dims.sizes[3] - start_b : start_b + size[3];
  const int start_h = begin[2];
  const int stop_h =
      size[2] == -1 ? input_dims.sizes[2] - start_h : start_h + size[2];
  const int start_w = begin[1];
  const int stop_w =
      size[1] == -1 ? input_dims.sizes[1] - start_w : start_w + size[1];
  const int start_d = begin[0];
  const int stop_d =
      size[0] == -1 ? input_dims.sizes[0] - start_d : start_d + size[0];

  T* out_ptr = output_data;
  for (int in_b = start_b; in_b < stop_b; ++in_b) {
    for (int in_h = start_h; in_h < stop_h; ++in_h) {
      for (int in_w = start_w; in_w < stop_w; ++in_w) {
        const int len = stop_d - start_d;
        memcpy(out_ptr,
               input_data + Offset(input_dims, start_d, in_w, in_h, in_b),
               len * sizeof(T));
        out_ptr += len;
      }
    }
  }
}
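
// Illustrative note (added commentary, not part of the original library):
// Slice uses begin/size semantics where size == -1 means "to the end", with
// begin and size listed innermost-first (depth, width, height, batch). For
// example, assuming a 1x4x4x3 input, begin = {0, 1, 1, 0} and
// size = {3, 2, -1, 1} copy all 3 channels of a patch spanning columns 1..2
// and rows 1..3 of batch 0.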

template <typename T>
inline void Mean(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& reduction_indices, T* output_data,
                 const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Mean");
  const int output_batch = ArraySize(output_dims, 3);
  const int output_height = ArraySize(output_dims, 2);
  const int output_width = ArraySize(output_dims, 1);
  const int output_depth = ArraySize(output_dims, 0);

  const int input_height = ArraySize(input_dims, 2);
  const int input_width = ArraySize(input_dims, 1);

  // The current implementation only supports simultaneous reduction over
  // width and height.
  TFLITE_DCHECK_EQ(reduction_indices.size(), 2);
  TFLITE_DCHECK((reduction_indices[0] == 1 && reduction_indices[1] == 2) ||
                (reduction_indices[0] == 2 && reduction_indices[1] == 1));
  TFLITE_DCHECK_EQ(output_height, 1);
  TFLITE_DCHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_dims, out_d, in_w, in_h, out_b)];
        }
      }
      output_data[Offset(output_dims, out_d, 0, 0, out_b)] =
          value / (input_width * input_height);
    }
  }
}

template <typename T>
void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
                         const T* input2_data, const Dims<4>& input2_dims,
                         T* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");

  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1,
                                      &desc2);

  // In TensorFlow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest
  // stride, typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has the smallest stride.
  //
  // We name our variables by their TensorFlow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for
  // the best cache behavior.
  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
          output_data[Offset(output_dims, c, x, y, b)] =
              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
        }
      }
    }
  }
}
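
// Illustrative note (added commentary, not part of the original library):
// broadcasting lets a small input, e.g. a [1, 1, 1, depth] bias-like tensor,
// be combined with a full [batches, height, width, depth] tensor: the
// broadcast descriptors walk size-1 dimensions with a zero stride, so
// SubscriptToIndex keeps returning the same `depth` values of the smaller
// input for every (b, y, x).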

template <typename T>
void Sub(const T* input1_data, const Dims<4>& input1_dims,
         const T* input2_data, const Dims<4>& input2_dims, T* output_data,
         const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("Sub");

  auto input1_map = MapAsVector(input1_data, input1_dims);
  auto input2_map = MapAsVector(input2_data, input2_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  if (AreSameDims(input1_dims, input2_dims)) {
    // Same shapes: a straight elementwise subtraction.
    output_map.array() = input1_map.array() - input2_map.array();
  } else if (RequiredBufferSizeForDims(input1_dims) == 1) {
    // input1 is a scalar: broadcast it against input2.
    auto scalar = input1_data[0];
    output_map.array() = scalar - input2_map.array();
  } else if (RequiredBufferSizeForDims(input2_dims) == 1) {
    // input2 is a scalar: broadcast it against input1.
    auto scalar = input2_data[0];
    output_map.array() = input1_map.array() - scalar;
  } else {
    // General broadcasting: fall back to the slow elementwise loop.
    GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
                        output_data, output_dims);
  }
}

template <typename T>
void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("TensorFlowMinimum");
  auto input1_map = MapAsVector(input1_data, input1_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  auto min_value = input2_data[0];
  output_map.array() = input1_map.array().min(min_value);
}

template <typename T>
void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                       const T* input2_data, T* output_data,
                       const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("TensorFlowMaximum");
  auto input1_map = MapAsVector(input1_data, input1_dims);
  auto output_map = MapAsVector(output_data, output_dims);
  auto max_value = input2_data[0];
  output_map.array() = input1_map.array().max(max_value);
}
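
// Illustrative note (added commentary, not part of the original library):
// TensorFlowMinimum and TensorFlowMaximum read only input2_data[0], i.e.
// they clamp against a scalar. Chaining them clamps to a range: applying
// Maximum with 0.f and then Minimum with 6.f maps {-1.f, 3.f, 9.f} to
// {0.f, 3.f, 6.f}.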

template <typename T1, typename T2, typename T3>
void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
            T2* output_data, const Dims<4>& output_dims) {
  gemmlowp::ScopedProfilingLabel label("ArgMax");

  // The current ArgMax implementation can only determine the index of the
  // maximum value in the last dimension. The axis argument is therefore only
  // checked, not otherwise used.
  TFLITE_DCHECK_EQ(axis[0], 3);

  // For ArgMax, the number of output dimensions is (number of input
  // dimensions - 1). For the sake of simplicity, the output dimensions are
  // kept equal to the input dimensions here; we enforce the constraint that
  // the last dimension must always be 1.
  TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
  const int depth = ArraySize(input_dims, 0);
  for (int b = 0; b < batches; ++b) {
    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; ++x) {
        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
        int max_index = 0;
        for (int d = 1; d < depth; ++d) {
          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
          if (curr_value > max_value) {
            max_value = curr_value;
            max_index = d;
          }
        }
        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
      }
    }
  }
}

}  // namespace optimized_ops
}  // namespace tflite

#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS
#pragma GCC diagnostic pop
#endif

#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_OPS_H_