16b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org/*
26b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
36b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *
46b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  Use of this source code is governed by a BSD-style license
56b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  that can be found in the LICENSE file in the root of the source
66b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  tree. An additional intellectual property rights grant can be found
76b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  in the file PATENTS.  All contributing project authors may
86b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org *  be found in the AUTHORS file in the root of the source tree.
96b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org */
106b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
/* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
 * WebRtcIsacfix_MatrixProduct2Neon() for the ARM NEON platform. The APIs are
 * in entropy_coding.c. Results are bit-exact with the C code for generic
 * platforms.
 */
166b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
176b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org#include "entropy_coding.h"
186b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
196b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org#include <arm_neon.h>
206b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org#include <assert.h>
216b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org#include <stddef.h>
226b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
236b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org#include "signal_processing_library.h"
246b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
256b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.orgvoid WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
266b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int32_t matrix1[],
276b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      int32_t matrix_product[],
286b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix1_index_factor1,
296b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix0_index_factor1,
306b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix1_index_init_case,
316b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix1_index_step,
326b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix0_index_step,
336b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int inner_loop_count,
346b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int mid_loop_count,
356b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int shift) {
366b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int j = 0, k = 0, n = 0;
376b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
386b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int* matrix1_index_factor2 = &j;
396b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int* matrix0_index_factor2 = &k;
406b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  if (matrix1_index_init_case != 0) {
416b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    matrix1_index_factor2 = &k;
426b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    matrix0_index_factor2 = &j;
436b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
446b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int32x4_t shift32x4 = vdupq_n_s32(shift);
456b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int32x2_t shift32x2 = vdup_n_s32(shift);
460c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org  int32x4_t sum_32x4 =  vdupq_n_s32(0);
470c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org  int32x2_t sum_32x2 =  vdup_n_s32(0);
486b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
496b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  assert(inner_loop_count % 2 == 0);
506b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  assert(mid_loop_count % 2 == 0);
516b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
526b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
536b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    for (j = 0; j < SUBFRAMES; j++) {
546b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix_prod_index = mid_loop_count * j;
556b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
560c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
576b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = k;
586b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = matrix0_index_factor1 * j;
596b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < inner_loop_count; n++) {
606b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix0_32x4 =
616b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
626b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix1_32x4 =
636b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
646b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
656b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
666b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += matrix1_index_step;
676b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += matrix0_index_step;
686b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
696b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
706b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index += 4;
716b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
726b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      if (mid_loop_count % 4 > 1) {
730c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
746b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = k;
756b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        k += 2;
766b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = matrix0_index_factor1 * j;
776b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < inner_loop_count; n++) {
786b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x2_t matrix0_32x2 =
796b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
806b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x2_t matrix1_32x2 =
816b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
826b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
836b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
846b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += matrix1_index_step;
856b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += matrix0_index_step;
866b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
876b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
886b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index += 2;
896b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
906b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    }
916b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
926b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
930c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org    int32x2_t multi_32x2 = vdup_n_s32(0);
940c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org    int32x2_t matrix0_32x2 = vdup_n_s32(0);
956b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    for (j = 0; j < SUBFRAMES; j++) {
966b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix_prod_index = mid_loop_count * j;
976b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
980c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
996b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = matrix1_index_factor1 * j;
1006b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = k;
1016b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < inner_loop_count; n++) {
1026b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
1036b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix0_32x4 =
1046b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
1056b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
1066b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
1076b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += matrix1_index_step;
1086b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += matrix0_index_step;
1096b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
1106b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
1116b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index += 4;
1126b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
1136b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      if (mid_loop_count % 4 > 1) {
1140c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
1156b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = matrix1_index_factor1 * j;
1166b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = k;
1176b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < inner_loop_count; n++) {
1186b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
1190c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org          matrix0_32x2 =
1206b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
1216b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
1226b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                     matrix0_32x2, 1);
1236b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
1246b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
1256b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
1266b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += matrix1_index_step;
1276b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += matrix0_index_step;
1286b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
1296b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
1306b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index += 2;
1316b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
1326b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    }
1336b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
1346b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  else if (matrix1_index_init_case == 0 &&
1356b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org           matrix1_index_step == 1 &&
1366b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org           matrix0_index_step == 1) {
1370c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org    int32x2_t multi_32x2 = vdup_n_s32(0);
1380c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org    int32x2_t matrix0_32x2 = vdup_n_s32(0);
1396b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    for (j = 0; j < SUBFRAMES; j++) {
1406b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix_prod_index = mid_loop_count * j;
1416b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      for (k = 0; k < mid_loop_count; k++) {
1420c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
1436b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = matrix1_index_factor1 * j;
1446b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = matrix0_index_factor1 * k;
1456b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
1466b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix1_32x4 =
1476b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
1486b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t matrix0_32x4 =
1496b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
1506b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
1516b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
1526b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += 4;
1536b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += 4;
1546b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
1556b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
1566b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        if (inner_loop_count % 4 > 1) {
1576b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          int32x2_t matrix1_32x2 =
1586b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
1590c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org          matrix0_32x2 =
1606b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
1616b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
1626b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                     matrix0_32x2, 1);
1636b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
1646b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
1656b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
1666b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
1676b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
1686b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
1696b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index++;
1706b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
1716b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    }
1726b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
1736b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  else {
1746b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    for (j = 0; j < SUBFRAMES; j++) {
1756b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix_prod_index = mid_loop_count * j;
1766b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      for (k=0; k < mid_loop_count; k++) {
1776b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        int32_t sum32 = 0;
1786b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
1796b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
1806b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        for (n = 0; n < inner_loop_count; n++) {
1816b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
1826b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org              matrix1[matrix1_index] << shift));
1836b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix1_index += matrix1_index_step;
1846b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          matrix0_index += matrix0_index_step;
1856b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        }
1866b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_product[matrix_prod_index] = sum32;
1876b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org        matrix_prod_index++;
1886b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      }
1896b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    }
1906b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
1916b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org}
1926b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org
1936b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.orgvoid WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
1946b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int32_t matrix1[],
1956b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      int32_t matrix_product[],
1966b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix0_index_factor,
1976b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org                                      const int matrix0_index_step) {
1986b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int j = 0, n = 0;
1996b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
2000c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org  int32x2_t sum_32x2 = vdup_n_s32(0);
2016b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  for (j = 0; j < SUBFRAMES; j++) {
2020c46af62e2cc171f324f4d7e518e34f88ce88a6dkma@webrtc.org    sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
2036b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    matrix1_index = 0;
2046b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    matrix0_index = matrix0_index_factor * j;
2056b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    for (n = SUBFRAMES; n > 0; n--) {
2066b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      int32x2_t matrix0_32x2 =
2076b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
2086b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
2096b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
2106b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
2116b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix1_index += 2;
2126b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org      matrix0_index += matrix0_index_step;
2136b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    }
2146b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    sum_32x2 = vshr_n_s32(sum_32x2, 3);
2156b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
2166b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org    matrix_prod_index += 2;
2176b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org  }
2186b12f9704ef671d4ec2ab14c531c967373c8b137kma@webrtc.org}
219