10b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 20b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 30b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew SelleLicensed under the Apache License, Version 2.0 (the "License"); 40b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleyou may not use this file except in compliance with the License. 50b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew SelleYou may obtain a copy of the License at 60b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 70b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle http://www.apache.org/licenses/LICENSE-2.0 80b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 90b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew SelleUnless required by applicable law or agreed to in writing, software 100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selledistributed under the License is distributed on an "AS IS" BASIS, 110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew SelleWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew SelleSee the License for the specific language governing permissions and 130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellelimitations under the License. 140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle==============================================================================*/ 15f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ 16f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ 170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "fixedpoint/fixedpoint.h" 190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "public/gemmlowp.h" 200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "tensorflow/contrib/lite/kernels/internal/common.h" 210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "tensorflow/contrib/lite/kernels/internal/types.h" 220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellenamespace tflite { 240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellenamespace optimized_ops { 250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Implementation of quantized DepthwiseConv 270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> 290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel {}; 300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON 320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 8, 2> { 340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8x2_t filter_u8; 400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[0] = vld1_u8(filter_ptr); 410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[1] = vld1_u8(filter_ptr + 8); 420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), 450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vdupq_n_s16(filter_offset)); 460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4x2_t acc[2]; 510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); 530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); 540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8x2_t input_dup2 = vzipq_s16(input, input); 620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), 650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input_dup2.val[i])); 660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), 670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input_dup2.val[i])); 680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); 720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); 730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 8, 1> { 810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr); 870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 2 output pixels at a time. 920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8[2]; 1000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 1010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8[i] = vld1_u8(input_ptr + 8 * i); 1020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 16; 1040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input[2]; 1050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 1060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); 1070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 1090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); 1100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 1120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); 1130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = 1140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); 1150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); 1160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = 1170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); 1180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 1190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 1200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 1210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 1230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 1 output pixel at a time. 1250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 1260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 1270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 1280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vld1q_s32(acc_buffer_ptr); 1290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vld1q_s32(acc_buffer_ptr + 4); 1300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 1310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 1320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 1330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 1340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 1350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 1360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 1370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); 1380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); 1390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 1400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc[0]); 1410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4, acc[1]); 1420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 1430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 1460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 1470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 1480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 2> { 1490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 1500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 1510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 1520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 1530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 1540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr); 1550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 1560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 1570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 1580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 1590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 2 output pixels at a time. 1600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 1610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 1620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 1630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 1640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 1650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 1670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 1680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 1690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 1700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 1710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 1720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8x2_t input_dup2 = vzipq_s16(input, input); 1730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 1740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 1750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), 1760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input_dup2.val[i])); 1770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), 1780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input_dup2.val[i])); 1790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 1810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 1820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 1830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 1850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 1870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 1880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 1890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 1900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 1910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 1920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 1930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 1940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 1950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 1960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 1970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 1980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 1990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 2000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 2010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 2020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 2030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 2040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4x2_t input_dup2 = vzip_s16(input, input); 2050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 2060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); 2070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); 2080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 2090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 2100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 2110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 2130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 2160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 2170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 2180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 8> { 2190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 2200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 2210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 2220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 2230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 2240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 2250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 2260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); 2270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 2280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 2290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 2310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle two output pixels at a time. 2320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 2330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 2340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[8]; 2350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 2360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 2370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 2390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 2400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 2410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 2420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 2430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 2440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 2450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 2460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 2470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 2480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 2490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); 2500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); 2510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); 2520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); 2530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); 2540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); 2550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); 2560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); 2570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 2580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 2590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 2600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 32; 2620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 2640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 2650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 2660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 2670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 2680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 2690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 2710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 2720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 2730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 2740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 2; 2750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 2760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 2770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 2780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 2790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 2800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); 2810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); 2820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); 2830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); 2840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 2850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 2860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 2870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 2880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 2900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 2920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 2930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 2940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 2950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 2> { 2960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 2970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 2980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 2990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 3000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 3010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 3020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 3030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 3040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); 3050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); 3060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 3070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 3080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 3090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 3110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 4 output pixels at a time. 3120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 4; outp += 4) { 3130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 3140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 3150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 3160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 3170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 3200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 3210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 3220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 3230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 3240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 3250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8x2_t input_dup2 = vzipq_s16(input, input); 3260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 3270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); 3280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); 3290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); 3300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); 3310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 3320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 3330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 3340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 3360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 3380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 3390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 3400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc = vld1q_s32(acc_buffer_ptr); 3410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 3430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 3440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 3450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 2; 3460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 3470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 3480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 3490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 3500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; 3510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 3520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input_dup2); 3530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 3540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 3550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 3560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 3590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 3610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 1> { 3620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 3630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 3640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 3650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 3660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 3670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 3680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 3690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 3700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); 3710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); 3720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 3730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 3740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 3750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 3770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 output pixels at a time. 3780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 8; outp += 8) { 3790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 3800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 3810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 3820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 3830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 3850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8[2]; 3860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 3870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8[i] = vld1_u8(input_ptr + 8 * i); 3880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 16; 3900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input[2]; 3910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 3920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); 3930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 3950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); 3960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 3970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 3980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 3990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); 4000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); 4010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); 4020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); 4030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 4040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 4050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 4060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 4080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 4 output pixels at a time. 4100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 4; outp += 4) { 4110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 4120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 4130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 4140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 4150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 4170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 4180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 4190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 4200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 4210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 4230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); 4240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); 4250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 4260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 4270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 4280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 4300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 2 output pixels at a time. 4320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 4330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 4340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc = vld1q_s32(acc_buffer_ptr); 4350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 4360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 4370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 4380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 4390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 4400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 4410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 4420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 4430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 4440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 4450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 4470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input); 4480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 4490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 4500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 4510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 1 output pixel at a time. 4530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 4540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 4550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x2_t acc = vld1_s32(acc_buffer_ptr); 4560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 4570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 4580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 4590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 4600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 2; 4610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 4620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 4630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 4640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 4660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); 4670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 4680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_s32(acc_buffer_ptr, acc); 4690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 2; 4700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 4730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 4750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 1, 2> { 4760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 4770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 4780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 4790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 4800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 4810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 4820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 4830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 4840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); 4850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); 4860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 4870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 4880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 4890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 4910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 output pixels at a time. 4920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 8; outp += 8) { 4930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 4940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 4950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 4960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 4970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 4980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 4990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 5000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 5010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 5020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 5030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 5040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Duplicate the input values, 2-fold 5050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8x2_t input_dup2 = vzipq_s16(input, input); 5060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 5070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); 5080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); 5090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); 5100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); 5110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 5120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 5130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 5140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 5160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 5180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 5190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 5200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x2_t acc = vld1_s32(acc_buffer_ptr); 5210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 5230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint32 input = *input_ptr++ + input_offset; 5240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 5260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); 5270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 5280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_s32(acc_buffer_ptr, acc); 5290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 2; 5300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 5330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 5350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 1, 4> { 5360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 5370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 5380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 5390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 5400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 5410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 5420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 5430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 5440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); 5450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); 5460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 5470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 5480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 5490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 5510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 output pixels at a time. 5520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 8; outp += 8) { 5530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 5540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[8]; 5550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 5560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 5570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 5600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vld1_u8(input_ptr); 5610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 5620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 5630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 5640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 5660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); 5670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); 5680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); 5690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); 5700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); 5710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); 5720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); 5730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); 5740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 5760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 5770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 5780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 32; 5800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 4 output pixels at a time. 5820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 4; outp += 4) { 5830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 5840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 5850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 5860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 5870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 5880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 5890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 5900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 5910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 5920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 5930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 5940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 5950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 5960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 5970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 5980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 5990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 6010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); 6020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); 6030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); 6040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); 6050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 6070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 6080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 6090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 6110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 6130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 6140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 6150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc = vld1q_s32(acc_buffer_ptr); 6160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 6180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint32 input = *input_ptr++ + input_offset; 6190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 6210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_n_s16(acc, filter, input); 6220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 6230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 6240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 6250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 6280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 6300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 1> { 6310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 6320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 6330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 6340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 6350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 6360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 6370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 6380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 6390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); 6400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); 6410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 6420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 6430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 6440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 6460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 4 output pixels at a time. 6470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 4; outp += 4) { 6480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 6490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 6500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 6510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 6520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 6540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input[2]; 6550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 6560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i); 6570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 6580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 6590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 16; 6610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 6620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 6630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 0] = 6640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); 6650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 1] = 6660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); 6670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 6690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 6700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 6710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 6730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 6750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 6760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 6770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc; 6780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vld1q_s32(acc_buffer_ptr); 6790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 6810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 6820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 6830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 6840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 6850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 6860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 6870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 6880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 6890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 6900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 6910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input); 6920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 6930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 6940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 6950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 6970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 6980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 6990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 7000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 4> { 7010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 7020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 7030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 7040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 7050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 7060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 7070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 7080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); 7090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 7100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 7110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 7140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 2 output pixels at a time. 7150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 7160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 7170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[8]; 7180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 7190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 7200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 7230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vld1_u8(input_ptr); 7240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 8; 7250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 7260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 7270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 7290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), 7300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input), 0); 7310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), 7320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input), 1); 7330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), 7340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input), 2); 7350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), 7360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input), 3); 7370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), 7380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input), 0); 7390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), 7400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input), 1); 7410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), 7420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input), 2); 7430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), 7440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input), 3); 7450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 7460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 8; i++) { 7470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 7480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 32; 7500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 7520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 7530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 7540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 7550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 7560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 7570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 7600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 7610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 7620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 7630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 7640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 7650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += 4; 7660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 7670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 7680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 7690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 7710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); 7720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); 7730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); 7740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); 7750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 7760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 7770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 7780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 7800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 7820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 7830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 7840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 7850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 3> { 7860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 7870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 7880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 7890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 7900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // We will have to duplicate bytes in a NEON register, 3-fold. 7910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // We will do that by register-level table-look-up using VTBL instructions. 7920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Here we prepare the registers containing the table-lookup indices. 7930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2}, 7940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle {2, 3, 3, 3, 4, 4, 4, 5}, 7950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle {5, 5, 6, 6, 6, 7, 7, 7}}; 7960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t dup3_indices[3]; 7970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 3; i++) { 7980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle dup3_indices[i] = vld1_u8(dup3_indices_array[i]); 7990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 8010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 8020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 8030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_filter_ptr = filter_ptr; 8040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_input_ptr = input_ptr; 8050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int ic = 0; 8060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 input channels at a time. 8070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic <= input_depth - 8; ic += 8) { 8080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 8090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[3]; 8100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8x3_t filter_u8; 8110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[0] = vld1_u8(local_filter_ptr); 8120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); 8130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[2] = vld1_u8(local_filter_ptr + 16); 8140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 24; 8150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 3; i++) { 8160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = 8170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); 8180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 8190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, duplicate 3-fold, add input_offset. 8210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(local_input_ptr); 8220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_input_ptr += 8; 8230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 8240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8_dup3[3]; 8250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 3; i++) { 8260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]); 8270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input_dup3[3]; 8290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 3; i++) { 8300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16_dup3 = 8310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i])); 8320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); 8330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 8350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4x3_t acc[2]; 8360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 8370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); 8380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); 8390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); 8400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 8420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 3; j++) { 8430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), 8440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(filter[j])); 8450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), 8460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(filter[j])); 8470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 8490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 8500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); 8510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); 8520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); 8530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 24; 8550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one input channel at a time. 8570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic < input_depth; ic++) { 8580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 input_val = *local_input_ptr++ + input_offset; 8590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 3; i++) { 8600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 filter_val = local_filter_ptr[i] + filter_offset; 8610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; 8620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 3; 8640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 8660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 8690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 8700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 8710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 2> { 8720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 8730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 8740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 8750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 8760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 8770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 8780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_filter_ptr = filter_ptr; 8790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_input_ptr = input_ptr; 8800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int ic = 0; 8810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 input channels at a time. 8820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic <= input_depth - 8; ic += 8) { 8830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 8840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 8850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8x2_t filter_u8; 8860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[0] = vld1_u8(local_filter_ptr); 8870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); 8880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 16; 8890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 8900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = 8910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); 8920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 8930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 8940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset, duplicate 2-fold. 8950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(local_input_ptr); 8960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_input_ptr += 8; 8970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 8980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 8990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8x2_t input_dup2 = vzipq_s16(input, input); 9000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 9010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4x2_t acc[2]; 9020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 9030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); 9040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); 9050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 9070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 2; j++) { 9080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), 9090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(input_dup2.val[j])); 9100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), 9110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(input_dup2.val[j])); 9120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 9140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 9150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); 9160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); 9170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 9190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one input channel at a time. 9210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic < input_depth; ic++) { 9220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs. 9230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 input_val = *local_input_ptr++ + input_offset; 9240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 9250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 filter_val = local_filter_ptr[i] + filter_offset; 9260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; 9270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 2; 9290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 9310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 9340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 9350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 9360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 1> { 9370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 9380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 9390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 9400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 9410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 9420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 9430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_filter_ptr = filter_ptr; 9440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* local_input_ptr = input_ptr; 9450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int ic = 0; 9460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 16 input channels at a time. 9470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic <= input_depth - 16; ic += 16) { 9480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 9490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0); 9500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1); 9510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 16; 9520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); 9530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); 9540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset)); 9550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset)); 9560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 9570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0); 9580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1); 9590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_input_ptr += 16; 9600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); 9610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); 9620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); 9630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); 9640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 9650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); 9660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); 9670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); 9680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); 9690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); 9700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_1 = 9710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); 9720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); 9730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_3 = 9740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); 9750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 9760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); 9770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); 9780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); 9790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); 9800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 9810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 9820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 input channels at a time. 9830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic <= input_depth - 8; ic += 8) { 9840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 9850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr); 9860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_filter_ptr += 8; 9870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 9880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter = 9890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 9900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 9910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(local_input_ptr); 9920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle local_input_ptr += 8; 9930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 9940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 9950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 9960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 9970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 9980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 9990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 10010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); 10020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); 10030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 10040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 10060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 10080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one input channel at a time. 10100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; ic < input_depth; ic++) { 10110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 input_val = *local_input_ptr++ + input_offset; 10120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 filter_val = *local_filter_ptr++ + filter_offset; 10130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; 10140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 10160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 10190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 10200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 10210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 16, 1> { 10220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 10230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 10240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 10250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 10260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 10270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8[2]; 10280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8[i] = vld1_u8(filter_ptr + 8 * i); 10300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 10320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); 10340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); 10370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 10390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 10400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 10410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8[2]; 10420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8[i] = vld1_u8(input_ptr + 8 * i); 10440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 10460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input[2]; 10470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); 10490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); 10520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 10540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 10550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 10560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 10570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 10590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), 10610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_s16(filter[i])); 10620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), 10630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_high_s16(filter[i])); 10640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 10660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 10670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 10680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 10700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 10730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 10740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 10750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 8, 1> { 10760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 10770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 10780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 10790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 10800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 10810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr); 10820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); 10830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); 10840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 10850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 10860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 10870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t input_u8 = vld1_u8(input_ptr); 10880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); 10890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); 10900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 10910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 10920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 10930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 10940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 10950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 10960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); 10970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); 10980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 10990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 11000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 11010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 11030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 11040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 11070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 11080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 11090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 16> { 11100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 11110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 11120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 11130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 11140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 11150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8[2]; 11160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 11170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8[i] = vld1_u8(filter_ptr + 8 * i); 11180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter[2]; 11200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 11210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); 11220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 11240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); 11250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 11270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 11280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8 input_u8 = *input_ptr; 11290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 11300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 input = static_cast<int16>(input_u8 + input_offset); 11310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 11320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 11330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 11340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 11350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 11370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 11380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 0] = 11390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); 11400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[2 * i + 1] = 11410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); 11420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 11440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 4; i++) { 11450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 11460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 16; 11480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 11500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 11510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 11520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 11530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 32> { 11540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 11550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 11560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 11570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 11580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 11590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0); 11600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1); 11610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2); 11620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3); 11630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); 11640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); 11650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2)); 11660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3)); 11670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset)); 11680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset)); 11690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset)); 11700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset)); 11710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 11720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 11730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8 input_u8 = *input_ptr; 11740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 11750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 input = static_cast<int16>(input_u8 + input_offset); 11760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 11770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); 11780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); 11790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); 11800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); 11810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); 11820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5); 11830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6); 11840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7); 11850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 11860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); 11870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); 11880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); 11890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); 11900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input); 11910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input); 11920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input); 11930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input); 11940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 11950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); 11960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); 11970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); 11980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); 11990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); 12000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5); 12010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6); 12020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7); 12030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 32; 12040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 12070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 12080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 12094d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlowerstruct QuantizedDepthwiseConvKernel<true, 1, 20> { 12104d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 12114d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower const uint8* input_ptr, int16 input_offset, 12124d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int input_ptr_increment, const uint8* filter_ptr, 12134d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int16 filter_offset, int32* acc_buffer_ptr) { 12144d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Load the filters, add filter_offset. 12154d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8. 12164d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // We load the first 16 bytes into filter_u8_{0,1} as usual. 12174d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Then we load the 8 last bytes into filter_u8_x (x for 'extra'). 12184d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // This is redundant: the first 4 bytes of filter_u8_x are the same 12194d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // as the last 4 bytes of filter_u8_x. 12204d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0); 12214d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1); 12224d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4); 12234d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); 12244d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); 12254d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x)); 12264d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset)); 12274d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset)); 12284d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset)); 12294d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Handle one output pixel at a time. 12304d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower for (int outp = 0; outp < num_output_pixels; outp++) { 12314d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower uint8 input_u8 = *input_ptr; 12324d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower input_ptr += input_ptr_increment; 12334d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int16 input = static_cast<int16>(input_u8 + input_offset); 12344d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Load the accumulators from acc_buffer 12354d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); 12364d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); 12374d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); 12384d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); 12394d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); 12404d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Multiply-accumulate 12414d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); 12424d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); 12434d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); 12444d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); 12454d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input); 12464d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower // Store the accumulators back to acc_buffer 12474d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); 12484d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); 12494d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); 12504d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); 12514d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); 12524d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower acc_buffer_ptr += 20; 12534d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower } 12544d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower } 12554d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower}; 12564d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower 12574d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlowertemplate <> 12580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 8> { 12590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 12600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 12610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 12620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 12630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 12640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t filter_u8 = vld1_u8(filter_ptr); 12650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t filter = vaddq_s16( 12660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); 12670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 12680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 12690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8 input_u8 = *input_ptr; 12700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 12710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 input = static_cast<int16>(input_u8 + input_offset); 12720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 12730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[2]; 12740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 12750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); 12760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 12780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); 12790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); 12800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 12810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int i = 0; i < 2; i++) { 12820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); 12830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 8; 12850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 12870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 12880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 12890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 12900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 2, 1> { 12910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 12920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 12930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 12940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 12950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 12960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 12970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 12980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 12990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); 13000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); 13010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 13020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 13030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 13040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 13060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 2 output pixels at a time. 13080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp <= num_output_pixels - 2; outp += 2) { 13090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 13100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc = vld1q_s32(acc_buffer_ptr); 13110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 13120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint16x4_t input_u16 = vdup_n_u16(0); 13130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0], 13140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u16, 0); 13150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 13160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0], 13170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u16, 1); 13180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 13190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = vreinterpret_s16_u16( 13200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); 13210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 13220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 13240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input); 13250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 13260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 13270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 13280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 13290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 1 output pixel at a time. 13310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels; outp++) { 13320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer. 13330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x2_t acc = vld1_s32(acc_buffer_ptr); 13340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 13350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 13360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 13370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 13380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 13390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 13400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 13410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 13420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate. 13440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); 13450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer. 13460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_s32(acc_buffer_ptr, acc); 13470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 2; 13480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 13490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 13500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 13510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 13530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 4, 1> { 13540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 13550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 13560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 13570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 13580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (num_output_pixels <= 0) { 13590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle return; 13600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 13610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 13630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8 = vdup_n_u8(0); 13640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); 13650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); 13660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); 13670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); 13680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter_s16 = 13690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); 13700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); 13710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int outp = 0; 13730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time until second to the last pixel. Second 13750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // to the last because we read eight input pixels while only processing 13760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // four. 13770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; outp < num_output_pixels - 1; outp++) { 13780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 13790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc; 13800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vld1q_s32(acc_buffer_ptr); 13810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 13830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vld1_u8(input_ptr); 13840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 13850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 13860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 13870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 13880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 13890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input); 13900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 13910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 13920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 4; 13930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 13940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 13950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle the last output pixel. 13960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 13970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc; 13980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vld1q_s32(acc_buffer_ptr); 13990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 14010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8 = vdup_n_u8(0); 14020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); 14030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); 14040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); 14050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); 14060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input_s16 = 14070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); 14080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); 14090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 14100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmlal_s16(acc, filter, input); 14110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 14120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr, acc); 14130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 14140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 14150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <> 14170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 12, 1> { 14180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static void Run(int num_output_pixels, int input_depth, int depth_multiplier, 14190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr, int16 input_offset, 14200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int input_ptr_increment, const uint8* filter_ptr, 14210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 filter_offset, int32* acc_buffer_ptr) { 14220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the filters, add filter_offset. 14230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_0 = vld1_u8(filter_ptr); 14240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4); 14250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0)); 14260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1)); 14270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset)); 14280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset)); 14290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x4_t filter_0 = vget_low_s16(filter_s16_0); 14300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x4_t filter_1 = vget_high_s16(filter_s16_0); 14310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x4_t filter_2 = vget_high_s16(filter_s16_1); 14320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle one output pixel at a time. 14340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int outp = 0; outp < num_output_pixels; outp++) { 14350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the inputs, add input_offset. 14360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8_0 = vld1_u8(input_ptr); 14370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4); 14380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 14390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0)); 14400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1)); 14410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); 14420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); 14430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Load the accumulators from acc_buffer 14450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); 14460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); 14470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); 14480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Multiply-accumulate 14500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); 14510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); 14520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); 14530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Store the accumulators back to acc_buffer 14550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); 14560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); 14570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); 14580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer_ptr += 12; 14600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 14610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 14620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}; 14630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif 14640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 14650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Accumulates the effect of one row of the filter, on a segment of one row 14660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// of the output, accessing the corresponding one row of the input. 14670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> 14680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid QuantizedDepthwiseConvAccumRow( 14690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int stride, int input_depth, int input_width, const uint8* input_data, 14700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 input_offset, int pad_width, int depth_multiplier, int filter_width, 14710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, 14720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int out_x_buffer_end, int output_depth, int32* acc_buffer) { 14730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef GEMMLOWP_PROFILING 14740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); 14750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif 14760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Sanity check parameters. This is important in particular to ensure 14770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // that we keep the number of template instantiations minimal, so we don't 14780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // increase binary size unnecessarily. 14790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); 14800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static_assert(kFixedInputDepth || kAllowStrided, ""); 14810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK(stride == 1 || kAllowStrided); 14820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (kFixedInputDepth) { 14830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth); 14840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 14850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (kFixedDepthMultiplier) { 14860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); 14870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 14880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier); 14890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int input_ptr_increment = stride * input_depth; 14900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* filter_base_ptr = filter_data; 14910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 14920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // For the current (filter_x, filter_y) point in the filter, 14930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // compute the boundaries of the corresponding output row segment. 14940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int out_x_loop_start_unclampled = 0; 14950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int out_x_loop_end_unclampled = 0; 14960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (kAllowStrided) { 14970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (stride == 2) { 14980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; 14990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_end_unclampled = 15000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (pad_width + input_width - filter_x + 1) / 2; 15010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else if (stride == 4) { 15020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; 15030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_end_unclampled = 15040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (pad_width + input_width - filter_x + 3) / 4; 15050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else { 15060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_start_unclampled = 15070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (pad_width - filter_x + stride - 1) / stride; 15080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_end_unclampled = 15090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (pad_width + input_width - filter_x + stride - 1) / stride; 15100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else { 15120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_start_unclampled = pad_width - filter_x; 15130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_loop_end_unclampled = pad_width + input_width - filter_x; 15140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // The kernel will have to iterate on the segment of the 15160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // output row that starts at out_x_loop_start and out_x_loop_end. 15170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int out_x_loop_start = 15180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle std::max(out_x_buffer_start, out_x_loop_start_unclampled); 15190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int out_x_loop_end = 15200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle std::min(out_x_buffer_end, out_x_loop_end_unclampled); 15210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 15220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32* acc_buffer_ptr = 15230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; 15240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; 15250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr = input_data + in_x_origin * input_depth; 15260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int num_output_pixels = out_x_loop_end - out_x_loop_start; 15270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle QuantizedDepthwiseConvKernel< 15280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle kAllowStrided, kFixedInputDepth, 15290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle kFixedDepthMultiplier>::Run(num_output_pixels, input_depth, 15300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle depth_multiplier, input_ptr, input_offset, 15310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr_increment, filter_base_ptr, 15320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_offset, acc_buffer_ptr); 15330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_base_ptr += output_depth; 15340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 15360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 15370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 15380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleinline void QuantizedDepthwiseConvAccumRowGeneric( 15390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int stride, int input_depth, int input_width, const uint8* input_data, 15400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16 input_offset, int pad_width, int depth_multiplier, int filter_width, 15410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, 15420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int out_x_buffer_end, int output_depth, int32* acc_buffer) { 15430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); 15440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 15450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 15460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle LOG(FATAL) 15470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "\n\n" 15480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "*****************************************************************\n" 15490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* This tfmini inference code was about to use the slow generic\n" 15500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* fallback implementation for a DepthwiseConv op, and we want you\n" 15510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* to be aware of that so that you will know why you get terrible\n" 15520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* performance.\n" 15530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "*\n" 15540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* If you would like to carry on with the slow code, compile\n" 15550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* with this preprocessor token defined:\n" 15562b7d03c91d092cda88e6db345705fff3cd5b7b77A. Unique TensorFlower << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n" 15570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "*\n" 15580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* The right thing to do, if you care about performance, is to add\n" 15590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* a new DepthwiseConv kernel to tfmini to cover your case.\n" 15600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* The relevant parameters defining your case are:\n" 15610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* stride = " << stride << "\n" 15620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* input_depth = " << input_depth << "\n" 15630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* depth_multiplier = " << depth_multiplier << "\n" 15640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "*\n" 15650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* Please do not hesitate to contact benoitjacob@ with this\n" 15660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "* information.\n" 15670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle << "*****************************************************************\n"; 15680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 15690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK 15700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* filter_base_ptr = filter_data; 15710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int filter_x = 0; filter_x < filter_width; ++filter_x) { 15720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int out_x_loop_start = std::max( 15730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); 15740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int out_x_loop_end = 15750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle std::min(out_x_buffer_end, 15760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (pad_width + input_width - filter_x + stride - 1) / stride); 15770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 15780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32* acc_buffer_ptr = 15790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; 15800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; 15810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* input_ptr = input_data + in_x_origin * input_depth; 15820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int input_ptr_increment = (stride - 1) * input_depth; 15830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { 15840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8* filter_ptr = filter_base_ptr; 15850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int ic = 0; ic < input_depth; ++ic) { 15860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 input_val = *input_ptr++ + input_offset; 15870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int m = 0; m < depth_multiplier; m++) { 15880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16 filter_val = *filter_ptr++ + filter_offset; 15890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; 15900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_ptr += input_ptr_increment; 15930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_base_ptr += output_depth; 15950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 15960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 15970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 15980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Initializes the accumulator buffer with bias values. 15990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleinline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, 16000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32* bias_data, 16010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32* acc_buffer) { 16020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int i = 0; 16030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON 16040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (output_depth == 1) { 16050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b = vdupq_n_s32(bias_data[0]); 16060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 16; i += 16) { 16070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + i + 0, b); 16080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + i + 4, b); 16090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + i + 8, b); 16100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + i + 12, b); 16110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 4; i += 4) { 16130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + i, b); 16140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else if (output_depth == 2) { 16160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t b = vdupq_n_s32(bias_data[0]); 16170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle b = vsetq_lane_s32(bias_data[1], b, 1); 16180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle b = vsetq_lane_s32(bias_data[1], b, 3); 16190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 8; i += 8) { 16200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 2 * i + 0, b); 16210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 2 * i + 4, b); 16220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 2 * i + 8, b); 16230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 2 * i + 12, b); 16240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 2; i += 2) { 16260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 2 * i, b); 16270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else if (output_depth == 4) { 16290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b = vld1q_s32(bias_data); 16300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 4; i += 4) { 16310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 4 * i + 0, b); 16320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 4 * i + 4, b); 16330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 4 * i + 8, b); 16340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 4 * i + 12, b); 16350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i < num_output_pixels; i++) { 16370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 4 * i, b); 16380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else if (output_depth == 8) { 16400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b0 = vld1q_s32(bias_data); 16410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b1 = vld1q_s32(bias_data + 4); 16420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_pixels - 2; i += 2) { 16430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 0, b0); 16440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 4, b1); 16450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 8, b0); 16460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 12, b1); 16470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i < num_output_pixels; i++) { 16490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 0, b0); 16500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 8 * i + 4, b1); 16510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } else if (output_depth == 16) { 16530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b0 = vld1q_s32(bias_data); 16540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b1 = vld1q_s32(bias_data + 4); 16550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b2 = vld1q_s32(bias_data + 8); 16560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t b3 = vld1q_s32(bias_data + 12); 16570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i < num_output_pixels; i++) { 16580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 16 * i + 0, b0); 16590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 16 * i + 4, b1); 16600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 16 * i + 8, b2); 16610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_s32(acc_buffer + 16 * i + 12, b3); 16620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif 16650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i < num_output_pixels; i++) { 16660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle memcpy(acc_buffer + i * output_depth, bias_data, 16670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle sizeof(acc_buffer[0]) * output_depth); 16680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 16690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 16700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 16710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleinline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 16720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 input_offset, const uint8* filter_data, 16730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const Dims<4>& filter_dims, int32 filter_offset, 16740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32* bias_data, const Dims<4>& bias_dims, 16750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int stride_width, int stride_height, int pad_width, 16760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int pad_height, int depth_multiplier, 16770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_offset, int32 output_multiplier, 16780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int output_shift, int32 output_activation_min, 16790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_activation_max, uint8* output_data, 16800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const Dims<4>& output_dims) { 16810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); 16820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_LE(output_activation_min, output_activation_max); 16830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 16840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); 16850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); 16860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int input_height = ArraySize(input_dims, 2); 16870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int input_width = ArraySize(input_dims, 1); 16880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int input_depth = ArraySize(input_dims, 0); 16890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int filter_height = ArraySize(filter_dims, 2); 16900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int filter_width = ArraySize(filter_dims, 1); 16910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int output_height = ArraySize(output_dims, 2); 16920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int output_width = ArraySize(output_dims, 1); 16930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK(output_depth == input_depth * depth_multiplier); 16940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 16950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle static const int kAccBufferMaxSize = 2048; 16960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 acc_buffer[kAccBufferMaxSize]; 16970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth); 16980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; 16990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; 17000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, 17010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle kAccBufferActualSize); 17020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); 17030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1); 17040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // row_accum_func will point to the core accumulation function to be used 17060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // for this DepthwiseConv op. 17070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); 17080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle row_accum_func_t row_accum_func = nullptr; 17090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ 17110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle FIXED_DEPTH_MULTIPLIER) \ 17120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ 17130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ 17140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ 17150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle row_accum_func = \ 17160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ 17170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle FIXED_DEPTH_MULTIPLIER>; \ 17180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 17190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON 17210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // We go over our list of kernels by decreasing order of preference 17220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // for the cases where multiple kernels could apply. 17230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Start with the fastest kernels: AllowStrided=false, fixed input depth. 17250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) 17270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) 17280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) 17290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) 17300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) 17310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) 17320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) 17330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) 17340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) 17350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) 17360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Next come the strided kernels: AllowStrided=true, fixed input depth. 17380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // They are a bit less efficient, but allow stride!=1. 17390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) 17410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) 17420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) 17434d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) 17440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) 17450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) 17460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) 17470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) 17480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) 17490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Finally, the kernels allowing a variable input depth, 17510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // these are the least efficient but most general kernels. 17520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) 17540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) 17550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) 17560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif // USE_NEON 17570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // No matching fast kernel found, use slow fallback. 17590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (!row_accum_func) { 17600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; 17610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 17620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#undef TFMINI_USE_DEPTHWISECONV_KERNEL 17640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 17650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Now that we have determined row_accum_func, we can start work. 17660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8* output_ptr = output_data; 17670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int b = 0; b < batches; ++b) { 17680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int out_y = 0; out_y < output_height; ++out_y) { 17690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int in_y_origin = (out_y * stride_height) - pad_height; 17700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int filter_y_start = std::max(0, -in_y_origin); 17710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int filter_y_end = 17720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle std::min(filter_height, input_height - in_y_origin); 17730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; 17740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_buffer_start += kOutputPixelsInAccBuffer) { 17750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int out_x_buffer_end = std::min( 17760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); 17770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // We call a 'pixel' a group of activation that share all but the 17780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // 'depth'/'channel' coordinate. num_output_pixels is the number of 17790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // output pixels that we will accumulate in this loop iteration. 17800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; 17810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Initialize our local accumulator with the bias values, so we don't 17820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // have to add them later. 17830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, 17840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_buffer); 17850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Accumulation loop. Most of the time should be spent in here. 17860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int filter_y = filter_y_start; filter_y < filter_y_end; 17870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle ++filter_y) { 17880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int in_y = in_y_origin + filter_y; 17890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle row_accum_func( 17900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle stride_width, input_depth, input_width, 17910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_data + in_y * input_dims.strides[2] + 17920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle b * input_dims.strides[3], 17930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle input_offset, pad_width, depth_multiplier, filter_width, 17940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_data + filter_y * filter_dims.strides[2], filter_offset, 17950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); 17960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 17970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Finished accumulating int32 values. Now need to convert them to 17980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // the final 8bit form and store them. 17990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle gemmlowp::ScopedProfilingLabel label("downquantize+store"); 18000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int num_output_values = output_depth * num_output_pixels; 18010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int i = 0; 18020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON 18030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle using gemmlowp::RoundingDivideByPOT; 18040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); 18050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t output_activation_min_vec = 18060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vdupq_n_s32(output_activation_min); 18070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32x4_t output_activation_max_vec = 18080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vdupq_n_s32(output_activation_max); 18090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 16 values at once. 18100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // This allows us to issue 4 mutually independent int32 18110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // multiplications (vqrdmulh), which should alleviate most of their 18120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // high latency. 18130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_values - 16; i += 16) { 18140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc[4]; 18150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = vld1q_s32(acc_buffer + i + 4 * j); 18170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 18190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Fixed-point multiplication. 18200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); 18220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = RoundingDivideByPOT(acc[j], output_shift); 18250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Add the output offset. 18270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = vaddq_s32(acc[j], output_offset_vec); 18290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Apply the activation function. 18310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); 18330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc[j] = vminq_s32(acc[j], output_activation_max_vec); 18360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Saturating cast to uint8 and store to destination. 18380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int16x4_t acc_s16[4]; 18390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (int j = 0; j < 4; j++) { 18400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc_s16[j] = vqmovn_s32(acc[j]); 18410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); 18430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); 18440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); 18450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); 18460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); 18470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_ptr += 16; 18480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 8 values at once. 18500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Not as good as 16 (now we're only issuing 2 mutually independent 18510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // vqrdmulh instructions, so we're probably paying for their high 18520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // latency). 18530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_values - 8; i += 8) { 18540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc0 = vld1q_s32(acc_buffer + i); 18550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); 18560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Fixed-point multiplication. 18570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); 18580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); 18590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Rounding right shift. 18600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc0 = RoundingDivideByPOT(acc0, output_shift); 18610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc1 = RoundingDivideByPOT(acc1, output_shift); 18620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Add the output offset. 18630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc0 = vaddq_s32(acc0, output_offset_vec); 18640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc1 = vaddq_s32(acc1, output_offset_vec); 18650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Apply the activation function. 18660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc0 = vmaxq_s32(acc0, output_activation_min_vec); 18670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc1 = vmaxq_s32(acc1, output_activation_min_vec); 18680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc0 = vminq_s32(acc0, output_activation_max_vec); 18690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc1 = vminq_s32(acc1, output_activation_max_vec); 18700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Saturating cast to uint8 and store to destination. 18710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t acc0_s16 = vqmovn_s32(acc0); 18720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t acc1_s16 = vqmovn_s32(acc1); 18730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); 18740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t res_u8 = vqmovun_s16(res_s16); 18750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_u8(output_ptr, res_u8); 18760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_ptr += 8; 18770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 18780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle 4 values at once. Now we're paying the full price of the 18790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // high latency of vqrdmulh. Also, storing only 4 bytes at the end 18800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // (without any alignment) can only be done 1 byte at a time. 18810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Yet, that is still worth doing to minimize the amount of leftover 18820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // that will have to go through the very slow scalar code. 18830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i <= num_output_values - 4; i += 4) { 18840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32x4_t acc = vld1q_s32(acc_buffer + i); 18850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Fixed-point multiplication. 18860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vqrdmulhq_n_s32(acc, output_multiplier); 18870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Rounding right shift. 18880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = RoundingDivideByPOT(acc, output_shift); 18890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Add the output offset. 18900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vaddq_s32(acc, output_offset_vec); 18910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Apply the activation function. 18920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vmaxq_s32(acc, output_activation_min_vec); 18930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = vminq_s32(acc, output_activation_max_vec); 18940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Saturating cast to uint8 and store to destination. 18950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x4_t acc_s16 = vqmovn_s32(acc); 18960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); 18970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const uint8x8_t res_u8 = vqmovun_s16(res_s16); 18980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_lane_u8(output_ptr + 0, res_u8, 0); 18990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_lane_u8(output_ptr + 1, res_u8, 1); 19000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_lane_u8(output_ptr + 2, res_u8, 2); 19010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle vst1_lane_u8(output_ptr + 3, res_u8, 3); 19020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_ptr += 4; 19030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif // USE_NEON 19050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 19060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle // Handle leftover values, one by one. This is very slow. 19070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle for (; i < num_output_values; i++) { 19080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 acc = acc_buffer[i]; 19090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = MultiplyByQuantizedMultiplierSmallerThanOne( 19100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc, output_multiplier, output_shift); 19110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc += output_offset; 19120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = std::max(acc, output_activation_min); 19130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle acc = std::min(acc, output_activation_max); 19140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle *output_ptr++ = static_cast<uint8>(acc); 19150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 19200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 19210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Legacy, for compatibility with old checked-in code. 19220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <FusedActivationFunctionType Ac> 19230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 19240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 input_offset, const uint8* filter_data, 19250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const Dims<4>& filter_dims, int32 filter_offset, 19260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32* bias_data, const Dims<4>& bias_dims, 19270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int stride_width, int stride_height, int pad_width, 19280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int pad_height, int depth_multiplier, int32 output_offset, 19290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_multiplier, int output_shift, 19300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_activation_min, int32 output_activation_max, 19310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle uint8* output_data, const Dims<4>& output_dims) { 19320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle if (Ac == FusedActivationFunctionType::kNone) { 19330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_EQ(output_activation_min, 0); 19340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle TFLITE_DCHECK_EQ(output_activation_max, 255); 19350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle } 19360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims, 19370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_offset, bias_data, bias_dims, stride_width, 19380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle stride_height, pad_width, pad_height, depth_multiplier, 19390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_offset, output_multiplier, output_shift, 19400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_activation_min, output_activation_max, output_data, 19410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_dims); 19420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 19430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 19440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Legacy, for compatibility with old checked-in code. 19450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <FusedActivationFunctionType Ac> 19460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, 19470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 input_offset, const uint8* filter_data, 19480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const Dims<4>& filter_dims, int32 filter_offset, 19490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const int32* bias_data, const Dims<4>& bias_dims, int stride, 19500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int pad_width, int pad_height, int depth_multiplier, 19510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_offset, int32 output_multiplier, 19520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int output_shift, int32 output_activation_min, 19530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle int32 output_activation_max, uint8* output_data, 19540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle const Dims<4>& output_dims) { 19550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data, 19560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle filter_dims, filter_offset, bias_data, bias_dims, stride, 19570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle stride, pad_width, pad_height, depth_multiplier, 19580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_offset, output_multiplier, output_shift, 19590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_activation_min, output_activation_max, output_data, 19600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle output_dims); 19610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} 19620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 19630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} // namespace optimized_ops 19640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle} // namespace tflite 19650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle 1966f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#endif // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_ 1967