/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
16f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "fixedpoint/fixedpoint.h"
190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "public/gemmlowp.h"
200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "tensorflow/contrib/lite/kernels/internal/common.h"
210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#include "tensorflow/contrib/lite/kernels/internal/types.h"
220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellenamespace tflite {
240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellenamespace optimized_ops {
250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
// Implementation of quantized DepthwiseConv.
270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
// Primary template for the quantized DepthwiseConv accumulation kernels.
//
// It is intentionally empty: the work is done by explicit specializations,
// each of which provides a static Run() for one fixed combination of the
// template parameters. In the specializations defined in this file:
//   kAllowStrided:         when true, Run() advances the input pointer by its
//                          input_ptr_increment argument after each output
//                          pixel; when false, it advances by a fixed amount.
//   kFixedInputDepth:      number of uint8 input values consumed per pixel.
//   kFixedDepthMultiplier: number of accumulators updated per input value.
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct QuantizedDepthwiseConvKernel {};
300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON
320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 8, 2> {
340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8x2_t filter_u8;
400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8.val[0] = vld1_u8(filter_ptr);
410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8.val[1] = vld1_u8(filter_ptr + 8);
420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter[2];
430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])),
450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                            vdupq_n_s16(filter_offset));
460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4x2_t acc[2];
510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]),
650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                  vget_low_s16(input_dup2.val[i]));
660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]),
670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                  vget_high_s16(input_dup2.val[i]));
680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 8, 1> {
810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 2 output pixels at a time.
920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8[2];
1000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
1010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input_u8[i] = vld1_u8(input_ptr + 8 * i);
1020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 16;
1040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input[2];
1050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
1060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
1070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
1090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
1100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
1120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
1130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] =
1140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
1150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
1160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] =
1170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
1180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
1190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
1200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
1230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
1240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 1 output pixel at a time.
1250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
1260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
1270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[2];
1280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vld1q_s32(acc_buffer_ptr);
1290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vld1q_s32(acc_buffer_ptr + 4);
1300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
1310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
1320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
1330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
1340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
1370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
1380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
1390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
1400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc[0]);
1410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4, acc[1]);
1420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 8;
1430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
1440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
1450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
1460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
1470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
1480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 2> {
1490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
1500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
1510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
1520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
1530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
1540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
1550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
1560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
1570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
1580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
1590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 2 output pixels at a time.
1600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
1610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
1620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
1630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
1640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
1670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
1680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
1690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
1700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
1710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
1720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
1730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
1740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
1750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter),
1760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                   vget_low_s16(input_dup2.val[i]));
1770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter),
1780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                   vget_high_s16(input_dup2.val[i]));
1790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
1810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
1820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
1830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
1850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
1860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
1870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
1880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
1890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[2];
1900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
1910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
1920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
1930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
1940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
1950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
1960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
1970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
1980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
1990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
2000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
2010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
2020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
2030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
2040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4x2_t input_dup2 = vzip_s16(input, input);
2050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
2060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
2070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
2080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
2090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
2100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
2110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
2120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 8;
2130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
2140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
2150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
2160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
2170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
2180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 8> {
2190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
2200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
2210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
2220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
2230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
2240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter[2];
2250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
2260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
2270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
2280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
2290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
2300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
2310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle two output pixels at a time.
2320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
2330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
2340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[8];
2350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
2360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
2370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
2380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
2390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
2400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
2410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
2420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
2430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
2440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
2450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
2460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
2470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
2480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
2490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
2500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
2510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
2520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
2530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
2540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
2550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
2560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
2570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
2580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
2590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
2600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
2610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 32;
2620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
2630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
2640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
2650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
2660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
2670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
2680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
2690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
2700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
2710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
2720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
2730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
2740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 2;
2750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
2760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
2770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
2780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
2790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
2800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
2810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
2820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
2830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
2840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
2850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
2860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
2870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
2880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
2890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
2900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
2910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
2920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
2930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
2940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
2950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 2> {
2960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
2970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
2980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
2990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
3000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
3010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
3020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
3030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
3040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
3050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
3060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
3070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
3080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
3090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
3110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 4 output pixels at a time.
3120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 4; outp += 4) {
3130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
3140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
3150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
3160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
3170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
3200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
3210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
3220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
3230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
3240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
3250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
3260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
3270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
3280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
3290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
3300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
3310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
3320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
3330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
3340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
3360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
3370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
3380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
3390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
3400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
3410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
3430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
3440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
3450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 2;
3460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
3470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
3480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
3490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
3500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
3510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
3520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_s16(acc, filter, input_dup2);
3530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
3540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
3550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
3560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
3570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
3580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
3590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
3610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 2, 1> {
3620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
3630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
3640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
3650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
3660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
3670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
3680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
3690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
3700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
3710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
3720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
3730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
3740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
3750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
3770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 8 output pixels at a time.
3780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 8; outp += 8) {
3790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
3800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
3810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
3820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
3830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
3850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8[2];
3860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
3870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input_u8[i] = vld1_u8(input_ptr + 8 * i);
3880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 16;
3900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input[2];
3910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
3920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
3930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
3950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
3960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
3970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
3980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
3990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
4000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
4010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
4020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
4030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
4040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
4050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
4060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
4070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
4080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
4090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 4 output pixels at a time.
4100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 4; outp += 4) {
4110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
4120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[2];
4130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
4140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
4150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
4160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
4170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
4180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
4190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
4200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
4210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
4230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
4240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
4250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
4260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
4270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
4280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
4290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 8;
4300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
4310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 2 output pixels at a time.
4320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
4330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
4340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
4350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
4360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
4370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
4380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
4390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
4400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
4410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
4420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
4430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
4440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
4450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
4470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_s16(acc, filter, input);
4480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
4490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
4500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
4510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
4520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 1 output pixel at a time.
4530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
4540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
4550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x2_t acc = vld1_s32(acc_buffer_ptr);
4560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
4570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
4580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
4590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
4600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 2;
4610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
4620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
4630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
4640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
4660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
4670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
4680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1_s32(acc_buffer_ptr, acc);
4690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 2;
4700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
4710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
4720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
4730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
4750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 1, 2> {
4760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
4770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
4780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
4790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
4800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
4810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
4820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
4830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
4840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
4850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
4860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
4870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
4880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
4890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
4910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 8 output pixels at a time.
4920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 8; outp += 8) {
4930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
4940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
4950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
4960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
4970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
4980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
4990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
5000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
5010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
5020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
5030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
5040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Duplicate the input values, 2-fold
5050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8x2_t input_dup2 = vzipq_s16(input, input);
5060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
5070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
5080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
5090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
5100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
5110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
5120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
5130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
5140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
5150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
5160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
5170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
5180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
5190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
5200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x2_t acc = vld1_s32(acc_buffer_ptr);
5210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
5230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint32 input = *input_ptr++ + input_offset;
5240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
5260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
5270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
5280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1_s32(acc_buffer_ptr, acc);
5290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 2;
5300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
5310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
5320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
5330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
5350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 1, 4> {
5360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
5370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
5380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
5390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
5400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
5410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
5420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
5430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
5440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
5450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
5460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
5470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
5480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
5490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
5510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 8 output pixels at a time.
5520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 8; outp += 8) {
5530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
5540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[8];
5550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
5560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
5570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
5580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
5600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vld1_u8(input_ptr);
5610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
5620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
5630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
5640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
5660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
5670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
5680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
5690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
5700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
5710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
5720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
5730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
5740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
5760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
5770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
5780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
5790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 32;
5800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
5810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 4 output pixels at a time.
5820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 4; outp += 4) {
5830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
5840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
5850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
5860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
5870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
5880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
5890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
5900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
5910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
5920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
5930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
5940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
5950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
5960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
5970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
5980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
5990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
6010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
6020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
6030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
6040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
6050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
6070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
6080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
6090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
6100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
6110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
6120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
6130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
6140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
6150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
6160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
6180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint32 input = *input_ptr++ + input_offset;
6190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
6210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_n_s16(acc, filter, input);
6220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
6230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
6240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
6250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
6260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
6270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
6280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
6300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 1> {
6310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
6320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
6330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
6340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
6350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
6360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
6370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
6380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
6390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
6400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
6410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
6420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
6430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
6440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
6460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 4 output pixels at a time.
6470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 4; outp += 4) {
6480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
6490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
6500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
6510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
6520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
6530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
6540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input[2];
6550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
6560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i);
6570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
6580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
6590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
6600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 16;
6610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
6620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
6630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 0] =
6640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
6650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 1] =
6660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
6670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
6680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
6690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
6700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
6710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
6720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
6730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
6740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
6750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
6760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
6770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc;
6780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vld1q_s32(acc_buffer_ptr);
6790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
6810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
6820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
6830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
6840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
6850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
6860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
6870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
6880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
6890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
6900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
6910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_s16(acc, filter, input);
6920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
6930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
6940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
6950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
6960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
6970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
6980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
6990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
7000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 4, 4> {
7010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
7020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
7030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
7040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
7050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
7060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter[2];
7070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
7080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i);
7090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
7100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
7110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
7120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
7140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 2 output pixels at a time.
7150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
7160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
7170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[8];
7180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
7190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
7200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
7210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
7230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vld1_u8(input_ptr);
7240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 8;
7250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
7260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
7270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
7290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]),
7300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_low_s16(input), 0);
7310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]),
7320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_low_s16(input), 1);
7330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]),
7340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_low_s16(input), 2);
7350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]),
7360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_low_s16(input), 3);
7370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]),
7380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_high_s16(input), 0);
7390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]),
7400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_high_s16(input), 1);
7410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]),
7420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_high_s16(input), 2);
7430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]),
7440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                              vget_high_s16(input), 3);
7450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
7460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 8; i++) {
7470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
7480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
7490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 32;
7500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
7510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
7520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
7530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
7540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
7550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
7560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
7570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
7580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
7600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
7610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
7620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
7630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
7640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
7650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += 4;
7660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
7670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
7680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
7690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
7710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
7720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
7730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
7740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
7750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
7760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
7770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
7780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
7790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
7800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
7810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
7820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
7830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
7840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
7850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 3> {
7860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
7870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
7880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
7890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
7900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // We will have to duplicate bytes in a NEON register, 3-fold.
7910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // We will do that by register-level table-look-up using VTBL instructions.
7920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Here we prepare the registers containing the table-lookup indices.
7930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2},
7940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                                   {2, 3, 3, 3, 4, 4, 4, 5},
7950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                                   {5, 5, 6, 6, 6, 7, 7, 7}};
7960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t dup3_indices[3];
7970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 3; i++) {
7980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      dup3_indices[i] = vld1_u8(dup3_indices_array[i]);
7990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
8000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
8010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
8020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
8030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_filter_ptr = filter_ptr;
8040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_input_ptr = input_ptr;
8050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int ic = 0;
8060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle 8 input channels at a time.
8070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic <= input_depth - 8; ic += 8) {
8080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the filters, add filter_offset.
8090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t filter[3];
8100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8x3_t filter_u8;
8110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_u8.val[0] = vld1_u8(local_filter_ptr);
8120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
8130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_u8.val[2] = vld1_u8(local_filter_ptr + 16);
8140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 24;
8150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 3; i++) {
8160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t filter_s16 =
8170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
8180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
8190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the inputs, duplicate 3-fold, add input_offset.
8210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
8220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_input_ptr += 8;
8230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
8240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8_t input_u8_dup3[3];
8250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 3; i++) {
8260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]);
8270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t input_dup3[3];
8290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 3; i++) {
8300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t input_s16_dup3 =
8310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i]));
8320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
8330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the accumulators from acc_buffer
8350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4x3_t acc[2];
8360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
8370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
8380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
8390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
8400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Multiply-accumulate
8420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int j = 0; j < 3; j++) {
8430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]),
8440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    vget_low_s16(filter[j]));
8450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]),
8460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    vget_high_s16(filter[j]));
8470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Store the accumulators back to acc_buffer
8490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
8500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
8510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
8520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
8530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer_ptr += 24;
8550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
8560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle one input channel at a time.
8570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic < input_depth; ic++) {
8580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16 input_val = *local_input_ptr++ + input_offset;
8590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 3; i++) {
8600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16 filter_val = local_filter_ptr[i] + filter_offset;
8610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
8620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 3;
8640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
8650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
8660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
8670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
8680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
8690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
8700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
8710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 2> {
8720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
8730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
8740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
8750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
8760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
8770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
8780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_filter_ptr = filter_ptr;
8790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_input_ptr = input_ptr;
8800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int ic = 0;
8810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle 8 input channels at a time.
8820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic <= input_depth - 8; ic += 8) {
8830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the filters, add filter_offset.
8840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t filter[2];
8850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8x2_t filter_u8;
8860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_u8.val[0] = vld1_u8(local_filter_ptr);
8870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_u8.val[1] = vld1_u8(local_filter_ptr + 8);
8880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 16;
8890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
8900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t filter_s16 =
8910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i]));
8920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
8930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
8940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the inputs, add input_offset, duplicate 2-fold.
8950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
8960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_input_ptr += 8;
8970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
8980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
8990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8x2_t input_dup2 = vzipq_s16(input, input);
9000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the accumulators from acc_buffer.
9010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4x2_t acc[2];
9020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
9030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
9040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
9050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
9060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Multiply-accumulate.
9070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int j = 0; j < 2; j++) {
9080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]),
9090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    vget_low_s16(input_dup2.val[j]));
9100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]),
9110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    vget_high_s16(input_dup2.val[j]));
9120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
9130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Store the accumulators back to acc_buffer.
9140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
9150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
9160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
9170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
9180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer_ptr += 16;
9190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
9200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle one input channel at a time.
9210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic < input_depth; ic++) {
9220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the inputs.
9230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16 input_val = *local_input_ptr++ + input_offset;
9240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
9250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16 filter_val = local_filter_ptr[i] + filter_offset;
9260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
9270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
9280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 2;
9290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
9300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
9310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
9320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
9330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
9340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
9350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
9360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 0, 1> {
9370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
9380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
9390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
9400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
9410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
9420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
9430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_filter_ptr = filter_ptr;
9440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* local_input_ptr = input_ptr;
9450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int ic = 0;
9460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle 16 input channels at a time.
9470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic <= input_depth - 16; ic += 16) {
9480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the filters, add filter_offset.
9490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8_t filter_u8_0 = vld1_u8(local_filter_ptr + 8 * 0);
9500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8_t filter_u8_1 = vld1_u8(local_filter_ptr + 8 * 1);
9510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 16;
9520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
9530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
9540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
9550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
9560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the inputs, add input_offset.
9570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8_t input_u8_0 = vld1_u8(local_input_ptr + 8 * 0);
9580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        uint8x8_t input_u8_1 = vld1_u8(local_input_ptr + 8 * 1);
9590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_input_ptr += 16;
9600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
9610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
9620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
9630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
9640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the accumulators from acc_buffer
9650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
9660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
9670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
9680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
9690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
9700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_1 =
9710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
9720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
9730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_3 =
9740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
9750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Store the accumulators back to acc_buffer
9760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
9770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
9780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
9790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
9800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer_ptr += 16;
9810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
9820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle 8 input channels at a time.
9830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic <= input_depth - 8; ic += 8) {
9840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the filters, add filter_offset.
9850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr);
9860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_filter_ptr += 8;
9870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
9880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t filter =
9890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
9900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the inputs, add input_offset.
9910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const uint8x8_t input_u8 = vld1_u8(local_input_ptr);
9920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        local_input_ptr += 8;
9930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
9940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
9950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Load the accumulators from acc_buffer
9960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int32x4_t acc[2];
9970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
9980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
9990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
10000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Multiply-accumulate
10010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
10020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
10030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Store the accumulators back to acc_buffer
10040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int i = 0; i < 2; i++) {
10050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
10060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
10070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer_ptr += 8;
10080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Handle one input channel at a time.
10100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (; ic < input_depth; ic++) {
10110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16 input_val = *local_input_ptr++ + input_offset;
10120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16 filter_val = *local_filter_ptr++ + filter_offset;
10130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
10140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
10160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
10170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
10180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
10190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
10200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
10210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 16, 1> {
10220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
10230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
10240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
10250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
10260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
10270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8[2];
10280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
10290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
10300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
10310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter[2];
10320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
10330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
10340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
10350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
10360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
10370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
10380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
10390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
10400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
10410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8[2];
10420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
10430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input_u8[i] = vld1_u8(input_ptr + 8 * i);
10440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
10460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input[2];
10470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
10480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i]));
10490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
10510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
10520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
10540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
10550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
10560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
10570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
10590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
10600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]),
10610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                   vget_low_s16(filter[i]));
10620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]),
10630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                   vget_high_s16(filter[i]));
10640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
10660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
10670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
10680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
10700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
10710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
10720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
10730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
10740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
10750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 8, 1> {
10760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
10770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
10780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
10790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
10800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
10810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
10820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8));
10830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset));
10840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
10850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
10860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
10870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8x8_t input_u8 = vld1_u8(input_ptr);
10880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8));
10890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
10900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
10910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[2];
10920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
10930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
10940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
10950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
10960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
10970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
10980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
10990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
11000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
11010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
11020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 8;
11030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
11040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
11050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
11060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
11070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
11080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
11090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 16> {
11100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
11110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
11120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
11130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
11140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
11150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8[2];
11160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
11170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter_u8[i] = vld1_u8(filter_ptr + 8 * i);
11180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
11190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter[2];
11200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
11210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i]));
11220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
11230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int i = 0; i < 2; i++) {
11240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset));
11250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
11260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
11270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
11280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8 input_u8 = *input_ptr;
11290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
11300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16 input = static_cast<int16>(input_u8 + input_offset);
11310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
11320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[4];
11330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
11340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
11350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
11360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
11370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
11380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 0] =
11390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
11400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[2 * i + 1] =
11410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
11420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
11430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
11440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 4; i++) {
11450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
11460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
11470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 16;
11480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
11490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
11500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
11510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
11520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
11530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 32> {
11540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
11550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
11560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
11570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
11580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
11590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
11600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
11610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_2 = vld1_u8(filter_ptr + 8 * 2);
11620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_3 = vld1_u8(filter_ptr + 8 * 3);
11630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
11640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
11650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_2 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_2));
11660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_3 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_3));
11670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
11680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
11690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_2 = vaddq_s16(filter_2, vdupq_n_s16(filter_offset));
11700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_3 = vaddq_s16(filter_3, vdupq_n_s16(filter_offset));
11710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
11720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
11730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8 input_u8 = *input_ptr;
11740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
11750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16 input = static_cast<int16>(input_u8 + input_offset);
11760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
11770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
11780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
11790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
11800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
11810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
11820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
11830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
11840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
11850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
11860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
11870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
11880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
11890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
11900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
11910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
11920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
11930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
11940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
11950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
11960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
11970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
11980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
11990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
12000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
12010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
12020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
12030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 32;
12040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
12050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
12060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
12070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
12080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
12094d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlowerstruct QuantizedDepthwiseConvKernel<true, 1, 20> {
12104d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
12114d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower                  const uint8* input_ptr, int16 input_offset,
12124d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower                  int input_ptr_increment, const uint8* filter_ptr,
12134d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower                  int16 filter_offset, int32* acc_buffer_ptr) {
12144d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // Load the filters, add filter_offset.
12154d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
12164d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // We load the first 16 bytes into filter_u8_{0,1} as usual.
12174d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // Then we load the 8 last bytes into filter_u8_x  (x for 'extra').
12184d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // This is redundant: the first 4 bytes of filter_u8_x are the same
12194d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // as the last 4 bytes of filter_u8_x.
12204d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr + 8 * 0);
12214d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 8 * 1);
12224d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    uint8x8_t filter_u8_x = vld1_u8(filter_ptr + 8 * 1 + 4);
12234d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    int16x8_t filter_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
12244d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    int16x8_t filter_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
12254d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    int16x8_t filter_x = vreinterpretq_s16_u16(vmovl_u8(filter_u8_x));
12264d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    filter_0 = vaddq_s16(filter_0, vdupq_n_s16(filter_offset));
12274d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    filter_1 = vaddq_s16(filter_1, vdupq_n_s16(filter_offset));
12284d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    filter_x = vaddq_s16(filter_x, vdupq_n_s16(filter_offset));
12294d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    // Handle one output pixel at a time.
12304d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    for (int outp = 0; outp < num_output_pixels; outp++) {
12314d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      uint8 input_u8 = *input_ptr;
12324d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      input_ptr += input_ptr_increment;
12334d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int16 input = static_cast<int16>(input_u8 + input_offset);
12344d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      // Load the accumulators from acc_buffer
12354d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
12364d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
12374d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
12384d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
12394d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
12404d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      // Multiply-accumulate
12414d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
12424d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
12434d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
12444d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
12454d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
12464d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      // Store the accumulators back to acc_buffer
12474d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
12484d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
12494d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
12504d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
12514d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
12524d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower      acc_buffer_ptr += 20;
12534d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower    }
12544d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower  }
12554d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower};
12564d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower
12574d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlowertemplate <>
12580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 1, 8> {
12590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
12600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
12610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
12620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
12630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
12640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
12650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x8_t filter = vaddq_s16(
12660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
12670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
12680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
12690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8 input_u8 = *input_ptr;
12700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
12710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16 input = static_cast<int16>(input_u8 + input_offset);
12720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
12730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc[2];
12740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
12750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
12760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
12770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
12780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
12790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
12800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
12810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int i = 0; i < 2; i++) {
12820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
12830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
12840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 8;
12850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
12860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
12870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
12880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
12890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
12900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 2, 1> {
12910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
12920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
12930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
12940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
12950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
12960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
12970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
12980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
12990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2);
13000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3);
13010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
13020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
13030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
13040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
13060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 2 output pixels at a time.
13080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp <= num_output_pixels - 2; outp += 2) {
13090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
13100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc = vld1q_s32(acc_buffer_ptr);
13110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
13120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint16x4_t input_u16 = vdup_n_u16(0);
13130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
13140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                input_u16, 0);
13150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
13160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u16 = vset_lane_u16((reinterpret_cast<const uint16*>(input_ptr))[0],
13170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                input_u16, 1);
13180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
13190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 = vreinterpret_s16_u16(
13200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
13210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
13220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
13240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_s16(acc, filter, input);
13250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
13260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
13270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
13280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
13290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle 1 output pixel at a time.
13310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels; outp++) {
13320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer.
13330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x2_t acc = vld1_s32(acc_buffer_ptr);
13340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
13350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vdup_n_u8(0);
13360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
13370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
13380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
13390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
13400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
13410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
13420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate.
13440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
13450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer.
13460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1_s32(acc_buffer_ptr, acc);
13470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 2;
13480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
13490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
13500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
13510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
13530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<true, 4, 1> {
13540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
13550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
13560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
13570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
13580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    if (num_output_pixels <= 0) {
13590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      return;
13600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
13610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
13630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8 = vdup_n_u8(0);
13640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0);
13650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1);
13660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2);
13670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3);
13680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter_s16 =
13690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8)));
13700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset));
13710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int outp = 0;
13730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time until second to the last pixel. Second
13750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // to the last because we read eight input pixels while only processing
13760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // four.
13770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (; outp < num_output_pixels - 1; outp++) {
13780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
13790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc;
13800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vld1q_s32(acc_buffer_ptr);
13810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
13830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8 = vld1_u8(input_ptr);
13840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
13850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input_s16 =
13860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
13870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
13880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
13890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc = vmlal_s16(acc, filter, input);
13900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
13910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr, acc);
13920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 4;
13930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
13940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
13950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle the last output pixel.
13960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the accumulators from acc_buffer
13970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int32x4_t acc;
13980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    acc = vld1q_s32(acc_buffer_ptr);
13990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the inputs, add input_offset.
14010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t input_u8 = vdup_n_u8(0);
14020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0);
14030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1);
14040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2);
14050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3);
14060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t input_s16 =
14070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8)));
14080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
14090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Multiply-accumulate
14100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    acc = vmlal_s16(acc, filter, input);
14110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Store the accumulators back to acc_buffer
14120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    vst1q_s32(acc_buffer_ptr, acc);
14130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
14140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
14150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <>
14170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellestruct QuantizedDepthwiseConvKernel<false, 12, 1> {
14180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
14190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  const uint8* input_ptr, int16 input_offset,
14200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int input_ptr_increment, const uint8* filter_ptr,
14210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  int16 filter_offset, int32* acc_buffer_ptr) {
14220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Load the filters, add filter_offset.
14230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_0 = vld1_u8(filter_ptr);
14240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    uint8x8_t filter_u8_1 = vld1_u8(filter_ptr + 4);
14250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_s16_0 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_0));
14260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x8_t filter_s16_1 = vreinterpretq_s16_u16(vmovl_u8(filter_u8_1));
14270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_s16_0 = vaddq_s16(filter_s16_0, vdupq_n_s16(filter_offset));
14280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_s16_1 = vaddq_s16(filter_s16_1, vdupq_n_s16(filter_offset));
14290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x4_t filter_0 = vget_low_s16(filter_s16_0);
14300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x4_t filter_1 = vget_high_s16(filter_s16_0);
14310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16x4_t filter_2 = vget_high_s16(filter_s16_1);
14320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // Handle one output pixel at a time.
14340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int outp = 0; outp < num_output_pixels; outp++) {
14350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the inputs, add input_offset.
14360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8_0 = vld1_u8(input_ptr);
14370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      uint8x8_t input_u8_1 = vld1_u8(input_ptr + 4);
14380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
14390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input_0 = vreinterpretq_s16_u16(vmovl_u8(input_u8_0));
14400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int16x8_t input_1 = vreinterpretq_s16_u16(vmovl_u8(input_u8_1));
14410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
14420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
14430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Load the accumulators from acc_buffer
14450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
14460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
14470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
14480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Multiply-accumulate
14500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
14510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
14520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
14530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      // Store the accumulators back to acc_buffer
14550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
14560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
14570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
14580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      acc_buffer_ptr += 12;
14600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
14610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
14620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle};
14630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif
14640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
14650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Accumulates the effect of one row of the filter, on a segment of one row
14660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// of the output, accessing the corresponding one row of the input.
14670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
14680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid QuantizedDepthwiseConvAccumRow(
14690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int stride, int input_depth, int input_width, const uint8* input_data,
14700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
14710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
14720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
14730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef GEMMLOWP_PROFILING
14740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__);
14750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif
14760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // Sanity check parameters. This is important in particular to ensure
14770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // that we keep the number of template instantiations minimal, so we don't
14780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // increase binary size unnecessarily.
14790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
14800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static_assert(kFixedInputDepth || kAllowStrided, "");
14810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK(stride == 1 || kAllowStrided);
14820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  if (kFixedInputDepth) {
14830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
14840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
14850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  if (kFixedDepthMultiplier) {
14860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
14870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
14880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
14890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int input_ptr_increment = stride * input_depth;
14900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const uint8* filter_base_ptr = filter_data;
14910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
14920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // For the current (filter_x, filter_y) point in the filter,
14930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // compute the boundaries of the corresponding output row segment.
14940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int out_x_loop_start_unclampled = 0;
14950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int out_x_loop_end_unclampled = 0;
14960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    if (kAllowStrided) {
14970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      if (stride == 2) {
14980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
14990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_end_unclampled =
15000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            (pad_width + input_width - filter_x + 1) / 2;
15010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      } else if (stride == 4) {
15020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
15030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_end_unclampled =
15040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            (pad_width + input_width - filter_x + 3) / 4;
15050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      } else {
15060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_start_unclampled =
15070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            (pad_width - filter_x + stride - 1) / stride;
15080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_loop_end_unclampled =
15090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            (pad_width + input_width - filter_x + stride - 1) / stride;
15100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
15110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    } else {
15120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      out_x_loop_start_unclampled = pad_width - filter_x;
15130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
15140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
15150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // The kernel will have to iterate on the segment of the
15160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    // output row that starts at out_x_loop_start and out_x_loop_end.
15170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int out_x_loop_start =
15180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        std::max(out_x_buffer_start, out_x_loop_start_unclampled);
15190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int out_x_loop_end =
15200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        std::min(out_x_buffer_end, out_x_loop_end_unclampled);
15210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
15220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int32* acc_buffer_ptr =
15230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
15240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
15250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8* input_ptr = input_data + in_x_origin * input_depth;
15260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
15270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    QuantizedDepthwiseConvKernel<
15280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        kAllowStrided, kFixedInputDepth,
15290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        kFixedDepthMultiplier>::Run(num_output_pixels, input_depth,
15300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    depth_multiplier, input_ptr, input_offset,
15310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    input_ptr_increment, filter_base_ptr,
15320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                    filter_offset, acc_buffer_ptr);
15330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_base_ptr += output_depth;
15340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
15350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}
15360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
15370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// generic fallback of DepthwiseConvAccumRow, portable, non-templatized.
15380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleinline void QuantizedDepthwiseConvAccumRowGeneric(
15390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int stride, int input_depth, int input_width, const uint8* input_data,
15400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int16 input_offset, int pad_width, int depth_multiplier, int filter_width,
15410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8* filter_data, int16 filter_offset, int out_x_buffer_start,
15420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int out_x_buffer_end, int output_depth, int32* acc_buffer) {
15430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)");
15440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
15450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifndef ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
15460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  LOG(FATAL)
15470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "\n\n"
15480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "*****************************************************************\n"
15490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* This tfmini inference code was about to use the slow generic\n"
15500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* fallback implementation for a DepthwiseConv op, and we want you\n"
15510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* to be aware of that so that you will know why you get terrible\n"
15520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* performance.\n"
15530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "*\n"
15540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* If you would like to carry on with the slow code, compile\n"
15550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* with this preprocessor token defined:\n"
15562b7d03c91d092cda88e6db345705fff3cd5b7b77A. Unique TensorFlower      << "* ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK.\n"
15570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "*\n"
15580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* The right thing to do, if you care about performance, is to add\n"
15590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* a new DepthwiseConv kernel to tfmini to cover your case.\n"
15600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* The relevant parameters defining your case are:\n"
15610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* stride = " << stride << "\n"
15620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* input_depth = " << input_depth << "\n"
15630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* depth_multiplier = " << depth_multiplier << "\n"
15640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "*\n"
15650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* Please do not hesitate to contact benoitjacob@ with this\n"
15660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "* information.\n"
15670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      << "*****************************************************************\n";
15680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif  // ALLOW_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
15690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif  // TFLITE_PREVENT_SLOW_GENERIC_DEPTHWISECONV_FALLBACK
15700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const uint8* filter_base_ptr = filter_data;
15710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
15720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int out_x_loop_start = std::max(
15730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride);
15740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int out_x_loop_end =
15750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        std::min(out_x_buffer_end,
15760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                 (pad_width + input_width - filter_x + stride - 1) / stride);
15770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
15780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    int32* acc_buffer_ptr =
15790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
15800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
15810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const uint8* input_ptr = input_data + in_x_origin * input_depth;
15820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    const int input_ptr_increment = (stride - 1) * input_depth;
15830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
15840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const uint8* filter_ptr = filter_base_ptr;
15850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int ic = 0; ic < input_depth; ++ic) {
15860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int16 input_val = *input_ptr++ + input_offset;
15870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int m = 0; m < depth_multiplier; m++) {
15880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16 filter_val = *filter_ptr++ + filter_offset;
15890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val;
15900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
15910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
15920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      input_ptr += input_ptr_increment;
15930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
15940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    filter_base_ptr += output_depth;
15950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
15960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}
15970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
// Initializes the accumulator buffer with bias values.
//
// Writes num_output_pixels consecutive copies of the output_depth bias values
// into acc_buffer, i.e. acc_buffer[p * output_depth + c] = bias_data[c] for
// every pixel p in [0, num_output_pixels) and channel c in [0, output_depth).
//
// Parameters:
//   num_output_pixels: number of pixels (groups of output_depth values) to
//                      initialize.
//   output_depth:      number of values per pixel; also the number of entries
//                      read from bias_data.
//   bias_data:         source of the output_depth bias values.
//   acc_buffer:        destination; must have room for at least
//                      num_output_pixels * output_depth int32 values.
//
// When USE_NEON is defined, depths 1, 2, 4, 8 and 16 use vectorized stores.
// Whatever pixels those paths leave unprocessed (and every pixel for any
// other depth, or when USE_NEON is not defined) are handled by the generic
// memcpy loop at the end, which resumes from the shared pixel index `i`.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const int32* bias_data,
                                       int32* acc_buffer) {
  // Index of the next pixel to initialize; shared by all paths below.
  int i = 0;
#ifdef USE_NEON
  if (output_depth == 1) {
    // One value per pixel: broadcast the single bias into all four lanes.
    const int32x4_t b = vdupq_n_s32(bias_data[0]);
    // 16 pixels (16 values) per iteration.
    for (; i <= num_output_pixels - 16; i += 16) {
      vst1q_s32(acc_buffer + i + 0, b);
      vst1q_s32(acc_buffer + i + 4, b);
      vst1q_s32(acc_buffer + i + 8, b);
      vst1q_s32(acc_buffer + i + 12, b);
    }
    // 4 pixels (4 values) per iteration.
    for (; i <= num_output_pixels - 4; i += 4) {
      vst1q_s32(acc_buffer + i, b);
    }
  } else if (output_depth == 2) {
    // Two values per pixel: build the lane pattern {b0, b1, b0, b1} so that
    // one vector store covers two pixels.
    int32x4_t b = vdupq_n_s32(bias_data[0]);
    b = vsetq_lane_s32(bias_data[1], b, 1);
    b = vsetq_lane_s32(bias_data[1], b, 3);
    // 8 pixels (16 values) per iteration.
    for (; i <= num_output_pixels - 8; i += 8) {
      vst1q_s32(acc_buffer + 2 * i + 0, b);
      vst1q_s32(acc_buffer + 2 * i + 4, b);
      vst1q_s32(acc_buffer + 2 * i + 8, b);
      vst1q_s32(acc_buffer + 2 * i + 12, b);
    }
    // 2 pixels (4 values) per iteration.
    for (; i <= num_output_pixels - 2; i += 2) {
      vst1q_s32(acc_buffer + 2 * i, b);
    }
  } else if (output_depth == 4) {
    // Four values per pixel: one vector holds exactly one pixel.
    const int32x4_t b = vld1q_s32(bias_data);
    // 4 pixels (16 values) per iteration.
    for (; i <= num_output_pixels - 4; i += 4) {
      vst1q_s32(acc_buffer + 4 * i + 0, b);
      vst1q_s32(acc_buffer + 4 * i + 4, b);
      vst1q_s32(acc_buffer + 4 * i + 8, b);
      vst1q_s32(acc_buffer + 4 * i + 12, b);
    }
    // Remaining pixels, one at a time.
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 4 * i, b);
    }
  } else if (output_depth == 8) {
    // Eight values per pixel: two vectors per pixel.
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    // 2 pixels (16 values) per iteration.
    for (; i <= num_output_pixels - 2; i += 2) {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
      vst1q_s32(acc_buffer + 8 * i + 8, b0);
      vst1q_s32(acc_buffer + 8 * i + 12, b1);
    }
    // Remaining pixel, if any.
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 8 * i + 0, b0);
      vst1q_s32(acc_buffer + 8 * i + 4, b1);
    }
  } else if (output_depth == 16) {
    // Sixteen values per pixel: four vectors per pixel.
    const int32x4_t b0 = vld1q_s32(bias_data);
    const int32x4_t b1 = vld1q_s32(bias_data + 4);
    const int32x4_t b2 = vld1q_s32(bias_data + 8);
    const int32x4_t b3 = vld1q_s32(bias_data + 12);
    for (; i < num_output_pixels; i++) {
      vst1q_s32(acc_buffer + 16 * i + 0, b0);
      vst1q_s32(acc_buffer + 16 * i + 4, b1);
      vst1q_s32(acc_buffer + 16 * i + 8, b2);
      vst1q_s32(acc_buffer + 16 * i + 12, b3);
    }
  }
#endif
  // Generic path: copy the whole bias vector into each remaining pixel.
  for (; i < num_output_pixels; i++) {
    memcpy(acc_buffer + i * output_depth, bias_data,
           sizeof(acc_buffer[0]) * output_depth);
  }
}
16700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
16710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selleinline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
16720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int32 input_offset, const uint8* filter_data,
16730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          const Dims<4>& filter_dims, int32 filter_offset,
16740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          const int32* bias_data, const Dims<4>& bias_dims,
16750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int stride_width, int stride_height, int pad_width,
16760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int pad_height, int depth_multiplier,
16770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int32 output_offset, int32 output_multiplier,
16780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int output_shift, int32 output_activation_min,
16790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          int32 output_activation_max, uint8* output_data,
16800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                          const Dims<4>& output_dims) {
16810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit");
16820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
16830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
16840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
16850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
16860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int input_height = ArraySize(input_dims, 2);
16870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int input_width = ArraySize(input_dims, 1);
16880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int input_depth = ArraySize(input_dims, 0);
16890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int filter_height = ArraySize(filter_dims, 2);
16900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int filter_width = ArraySize(filter_dims, 1);
16910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int output_height = ArraySize(output_dims, 2);
16920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int output_width = ArraySize(output_dims, 1);
16930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
16940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
16950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  static const int kAccBufferMaxSize = 2048;
16960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  int32 acc_buffer[kAccBufferMaxSize];
16970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
16980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
16990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
17000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
17010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   kAccBufferActualSize);
17020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
17030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
17040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // row_accum_func will point to the core accumulation function to be used
17060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // for this DepthwiseConv op.
17070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
17080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  row_accum_func_t row_accum_func = nullptr;
17090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
17110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                        FIXED_DEPTH_MULTIPLIER)           \
17120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
17130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
17140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
17150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    row_accum_func =                                                      \
17160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
17170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                       FIXED_DEPTH_MULTIPLIER>;           \
17180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
17190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON
17210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // We go over our list of kernels by decreasing order of preference
17220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // for the cases where multiple kernels could apply.
17230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
17250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
17270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
17280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
17290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
17300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
17310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
17320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
17330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
17340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
17350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
17360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // Next come the strided kernels: AllowStrided=true, fixed input depth.
17380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // They are a bit less efficient, but allow stride!=1.
17390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
17410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
17420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
17434d90af18b92eb804bce2c334e718fddc691df28eA. Unique TensorFlower  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
17440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
17450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
17460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
17470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
17480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
17490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // Finally, the kernels allowing a variable input depth,
17510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // these are the least efficient but most general kernels.
17520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
17540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
17550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
17560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif  // USE_NEON
17570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // No matching fast kernel found, use slow fallback.
17590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  if (!row_accum_func) {
17600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
17610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
17620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#undef TFMINI_USE_DEPTHWISECONV_KERNEL
17640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
17650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  // Now that we have determined row_accum_func, we can start work.
17660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  uint8* output_ptr = output_data;
17670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  for (int b = 0; b < batches; ++b) {
17680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    for (int out_y = 0; out_y < output_height; ++out_y) {
17690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int in_y_origin = (out_y * stride_height) - pad_height;
17700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int filter_y_start = std::max(0, -in_y_origin);
17710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      const int filter_y_end =
17720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          std::min(filter_height, input_height - in_y_origin);
17730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
17740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle           out_x_buffer_start += kOutputPixelsInAccBuffer) {
17750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int out_x_buffer_end = std::min(
17760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
17770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // We call a 'pixel' a group of activation that share all but the
17780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // 'depth'/'channel' coordinate. num_output_pixels is the number of
17790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // output pixels that we will accumulate in this loop iteration.
17800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
17810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Initialize our local accumulator with the bias values, so we don't
17820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // have to add them later.
17830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
17840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                                   acc_buffer);
17850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Accumulation loop. Most of the time should be spent in here.
17860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (int filter_y = filter_y_start; filter_y < filter_y_end;
17870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle             ++filter_y) {
17880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int in_y = in_y_origin + filter_y;
17890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          row_accum_func(
17900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              stride_width, input_depth, input_width,
17910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              input_data + in_y * input_dims.strides[2] +
17920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                  b * input_dims.strides[3],
17930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              input_offset, pad_width, depth_multiplier, filter_width,
17940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              filter_data + filter_y * filter_dims.strides[2], filter_offset,
17950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
17960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
17970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Finished accumulating int32 values. Now need to convert them to
17980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // the final 8bit form and store them.
17990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        gemmlowp::ScopedProfilingLabel label("downquantize+store");
18000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int num_output_values = output_depth * num_output_pixels;
18010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        int i = 0;
18020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#ifdef USE_NEON
18030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        using gemmlowp::RoundingDivideByPOT;
18040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int32x4_t output_offset_vec = vdupq_n_s32(output_offset);
18050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int32x4_t output_activation_min_vec =
18060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vdupq_n_s32(output_activation_min);
18070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        const int32x4_t output_activation_max_vec =
18080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            vdupq_n_s32(output_activation_max);
18090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Handle 16 values at once.
18100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // This allows us to issue 4 mutually independent int32
18110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // multiplications (vqrdmulh), which should alleviate most of their
18120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // high latency.
18130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (; i <= num_output_values - 16; i += 16) {
18140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int32x4_t acc[4];
18150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = vld1q_s32(acc_buffer + i + 4 * j);
18170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
18190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Fixed-point multiplication.
18200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier);
18220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = RoundingDivideByPOT(acc[j], output_shift);
18250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Add the output offset.
18270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = vaddq_s32(acc[j], output_offset_vec);
18290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Apply the activation function.
18310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = vmaxq_s32(acc[j], output_activation_min_vec);
18330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc[j] = vminq_s32(acc[j], output_activation_max_vec);
18360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Saturating cast to uint8 and store to destination.
18380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int16x4_t acc_s16[4];
18390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          for (int j = 0; j < 4; j++) {
18400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle            acc_s16[j] = vqmovn_s32(acc[j]);
18410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          }
18420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]);
18430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]);
18440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0);
18450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1);
18460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1));
18470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          output_ptr += 16;
18480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
18490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Handle 8 values at once.
18500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Not as good as 16 (now we're only issuing 2 mutually independent
18510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // vqrdmulh instructions, so we're probably paying for their high
18520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // latency).
18530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (; i <= num_output_values - 8; i += 8) {
18540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int32x4_t acc0 = vld1q_s32(acc_buffer + i);
18550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4);
18560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Fixed-point multiplication.
18570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
18580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
18590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Rounding right shift.
18600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc0 = RoundingDivideByPOT(acc0, output_shift);
18610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc1 = RoundingDivideByPOT(acc1, output_shift);
18620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Add the output offset.
18630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc0 = vaddq_s32(acc0, output_offset_vec);
18640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc1 = vaddq_s32(acc1, output_offset_vec);
18650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Apply the activation function.
18660b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc0 = vmaxq_s32(acc0, output_activation_min_vec);
18670b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc1 = vmaxq_s32(acc1, output_activation_min_vec);
18680b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc0 = vminq_s32(acc0, output_activation_max_vec);
18690b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc1 = vminq_s32(acc1, output_activation_max_vec);
18700b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Saturating cast to uint8 and store to destination.
18710b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x4_t acc0_s16 = vqmovn_s32(acc0);
18720b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x4_t acc1_s16 = vqmovn_s32(acc1);
18730b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16);
18740b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
18750b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1_u8(output_ptr, res_u8);
18760b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          output_ptr += 8;
18770b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
18780b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Handle 4 values at once. Now we're paying the full price of the
18790b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // high latency of vqrdmulh. Also, storing only 4 bytes at the end
18800b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // (without any alignment) can only be done 1 byte at a time.
18810b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Yet, that is still worth doing to minimize the amount of leftover
18820b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // that will have to go through the very slow scalar code.
18830b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (; i <= num_output_values - 4; i += 4) {
18840b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int32x4_t acc = vld1q_s32(acc_buffer + i);
18850b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Fixed-point multiplication.
18860b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = vqrdmulhq_n_s32(acc, output_multiplier);
18870b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Rounding right shift.
18880b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = RoundingDivideByPOT(acc, output_shift);
18890b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Add the output offset.
18900b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = vaddq_s32(acc, output_offset_vec);
18910b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Apply the activation function.
18920b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = vmaxq_s32(acc, output_activation_min_vec);
18930b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = vminq_s32(acc, output_activation_max_vec);
18940b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          // Saturating cast to uint8 and store to destination.
18950b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x4_t acc_s16 = vqmovn_s32(acc);
18960b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16);
18970b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          const uint8x8_t res_u8 = vqmovun_s16(res_s16);
18980b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1_lane_u8(output_ptr + 0, res_u8, 0);
18990b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1_lane_u8(output_ptr + 1, res_u8, 1);
19000b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1_lane_u8(output_ptr + 2, res_u8, 2);
19010b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          vst1_lane_u8(output_ptr + 3, res_u8, 3);
19020b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          output_ptr += 4;
19030b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
19040b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle#endif  // USE_NEON
19050b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
19060b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        // Handle leftover values, one by one. This is very slow.
19070b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        for (; i < num_output_values; i++) {
19080b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          int32 acc = acc_buffer[i];
19090b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
19100b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle              acc, output_multiplier, output_shift);
19110b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc += output_offset;
19120b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = std::max(acc, output_activation_min);
19130b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          acc = std::min(acc, output_activation_max);
19140b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle          *output_ptr++ = static_cast<uint8>(acc);
19150b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle        }
19160b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle      }
19170b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    }
19180b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
19190b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}
19200b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
19210b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Legacy, for compatibility with old checked-in code.
19220b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <FusedActivationFunctionType Ac>
19230b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
19240b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 input_offset, const uint8* filter_data,
19250b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   const Dims<4>& filter_dims, int32 filter_offset,
19260b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   const int32* bias_data, const Dims<4>& bias_dims,
19270b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int stride_width, int stride_height, int pad_width,
19280b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int pad_height, int depth_multiplier, int32 output_offset,
19290b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 output_multiplier, int output_shift,
19300b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 output_activation_min, int32 output_activation_max,
19310b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   uint8* output_data, const Dims<4>& output_dims) {
19320b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  if (Ac == FusedActivationFunctionType::kNone) {
19330b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    TFLITE_DCHECK_EQ(output_activation_min, 0);
19340b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle    TFLITE_DCHECK_EQ(output_activation_max, 255);
19350b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  }
19360b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  DepthwiseConv(input_data, input_dims, input_offset, filter_data, filter_dims,
19370b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                filter_offset, bias_data, bias_dims, stride_width,
19380b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                stride_height, pad_width, pad_height, depth_multiplier,
19390b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                output_offset, output_multiplier, output_shift,
19400b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                output_activation_min, output_activation_max, output_data,
19410b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                output_dims);
19420b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}
19430b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
19440b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle// Legacy, for compatibility with old checked-in code.
19450b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selletemplate <FusedActivationFunctionType Ac>
19460b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Sellevoid DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
19470b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 input_offset, const uint8* filter_data,
19480b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   const Dims<4>& filter_dims, int32 filter_offset,
19490b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   const int32* bias_data, const Dims<4>& bias_dims, int stride,
19500b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int pad_width, int pad_height, int depth_multiplier,
19510b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 output_offset, int32 output_multiplier,
19520b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int output_shift, int32 output_activation_min,
19530b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   int32 output_activation_max, uint8* output_data,
19540b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                   const Dims<4>& output_dims) {
19550b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle  DepthwiseConv<Ac>(input_data, input_dims, input_offset, filter_data,
19560b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                    filter_dims, filter_offset, bias_data, bias_dims, stride,
19570b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                    stride, pad_width, pad_height, depth_multiplier,
19580b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                    output_offset, output_multiplier, output_shift,
19590b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                    output_activation_min, output_activation_max, output_data,
19600b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle                    output_dims);
19610b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}
19620b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
19630b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}  // namespace optimized_ops
19640b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle}  // namespace tflite
19650b15439f8f0f2d4755587f4096c3ea04cb199d23Andrew Selle
1966f8347ceebbad0e06552633fcdf8e63f52246ba62Sanjoy Das#endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
1967