/*
 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/* This file contains WebRtcIsacfix_MatrixProduct1Neon() and
 * WebRtcIsacfix_MatrixProduct2Neon() for ARM Neon platforms. APIs are in
 * entropy_coding.c. Results are bit exact with the C code for generic
 * platforms.
 */

#include "entropy_coding.h"

#include <arm_neon.h>
#include <assert.h>
#include <stddef.h>

#include "signal_processing_library.h"

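/* Implementation note: vqdmulh(a, b) returns the high half of the saturating
 * doubling product, i.e. (2 * a * b) >> 32. Multiplying (matrix0[i] << 15) by
 * matrix1[j] therefore yields (matrix0[i] * matrix1[j]) >> 16, which mirrors
 * the scalar WEBRTC_SPL_MUL_16_32_RSFT16() used in the generic fallback
 * branch below.
 */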
void WebRtcIsacfix_MatrixProduct1Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix1_index_factor1,
                                      const int matrix0_index_factor1,
                                      const int matrix1_index_init_case,
                                      const int matrix1_index_step,
                                      const int matrix0_index_step,
                                      const int inner_loop_count,
                                      const int mid_loop_count,
                                      const int shift) {
  int j = 0, k = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int* matrix1_index_factor2 = &j;
  int* matrix0_index_factor2 = &k;
  if (matrix1_index_init_case != 0) {
    matrix1_index_factor2 = &k;
    matrix0_index_factor2 = &j;
  }
  int32x4_t shift32x4 = vdupq_n_s32(shift);
  int32x2_t shift32x2 = vdup_n_s32(shift);
  int32x4_t sum_32x4 = vdupq_n_s32(0);
  int32x2_t sum_32x2 = vdup_n_s32(0);

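  // The vector paths below consume elements in pairs or groups of four, so
  // both trip counts must be even; odd remainders are not handled.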
  assert(inner_loop_count % 2 == 0);
  assert(mid_loop_count % 2 == 0);

  if (matrix1_index_init_case != 0 && matrix1_index_factor1 == 1) {
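    // Case 1: matrix1 is indexed by the mid-loop counter with stride 1, so a
    // single vld1q load covers four adjacent outputs, while the matrix0
    // coefficient is broadcast to all four lanes.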
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = k;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix0_32x4 =
              vdupq_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
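      // Handle the remaining pair of outputs when mid_loop_count is not a
      // multiple of four (the asserts above guarantee it is even).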
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = k;
        k += 2;
        matrix0_index = matrix0_index_factor1 * j;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix0_32x2 =
              vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 && matrix0_index_factor1 == 1) {
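    // Case 2: matrix0 is indexed by the mid-loop counter with stride 1, so
    // four int16 coefficients are loaded and widened to Q15 with vshll, and
    // the matrix1 term is the one broadcast instead.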
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < (mid_loop_count >> 2) << 2; k += 4) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x4_t matrix1_32x4 = vdupq_n_s32(matrix1[matrix1_index] << shift);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1q_s32(&matrix_product[matrix_prod_index], sum_32x4);
        matrix_prod_index += 4;
      }
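      // As in case 1, finish the two leftover outputs of an even but
      // non-multiple-of-four mid_loop_count.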
      if (mid_loop_count % 4 > 1) {
        sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = k;
        for (n = 0; n < inner_loop_count; n++) {
          int32x2_t matrix1_32x2 = vdup_n_s32(matrix1[matrix1_index] << shift);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
        matrix_prod_index += 2;
      }
    }
  }
  else if (matrix1_index_init_case == 0 &&
           matrix1_index_step == 1 &&
           matrix0_index_step == 1) {
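    // Case 3: both matrices advance with stride 1 in the inner loop, so the
    // inner loop itself is vectorized four elements at a time and the partial
    // sums are reduced to a single output afterwards.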
    int32x2_t multi_32x2 = vdup_n_s32(0);
    int32x2_t matrix0_32x2 = vdup_n_s32(0);
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        sum_32x4 = veorq_s32(sum_32x4, sum_32x4);  // Initialize to zeros.
        matrix1_index = matrix1_index_factor1 * j;
        matrix0_index = matrix0_index_factor1 * k;
        for (n = 0; n < (inner_loop_count >> 2) << 2; n += 4) {
          int32x4_t matrix1_32x4 =
              vshlq_s32(vld1q_s32(&matrix1[matrix1_index]), shift32x4);
          int32x4_t matrix0_32x4 =
              vshll_n_s16(vld1_s16(&matrix0[matrix0_index]), 15);
          int32x4_t multi_32x4 = vqdmulhq_s32(matrix0_32x4, matrix1_32x4);
          sum_32x4 = vqaddq_s32(sum_32x4, multi_32x4);
          matrix1_index += 4;
          matrix0_index += 4;
        }
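        // Fold the four partial sums into two lanes, absorb a possible
        // two-element tail, then vpadd the lanes into the final scalar.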
        sum_32x2 = vqadd_s32(vget_low_s32(sum_32x4), vget_high_s32(sum_32x4));
        if (inner_loop_count % 4 > 1) {
          int32x2_t matrix1_32x2 =
              vshl_s32(vld1_s32(&matrix1[matrix1_index]), shift32x2);
          matrix0_32x2 =
              vset_lane_s32((int32_t)matrix0[matrix0_index], matrix0_32x2, 0);
          matrix0_32x2 = vset_lane_s32((int32_t)matrix0[matrix0_index + 1],
                                       matrix0_32x2, 1);
          matrix0_32x2 = vshl_n_s32(matrix0_32x2, 15);
          multi_32x2 = vqdmulh_s32(matrix1_32x2, matrix0_32x2);
          sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
        }
        sum_32x2 = vpadd_s32(sum_32x2, sum_32x2);
        vst1_lane_s32(&matrix_product[matrix_prod_index], sum_32x2, 0);
        matrix_prod_index++;
      }
    }
  }
  else {
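    // Generic scalar fallback for index patterns the vector paths above do
    // not cover; each term is (matrix0 * (matrix1 << shift)) >> 16, the same
    // fixed-point product the NEON paths compute with vqdmulh.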
    for (j = 0; j < SUBFRAMES; j++) {
      matrix_prod_index = mid_loop_count * j;
      for (k = 0; k < mid_loop_count; k++) {
        int32_t sum32 = 0;
        matrix1_index = matrix1_index_factor1 * (*matrix1_index_factor2);
        matrix0_index = matrix0_index_factor1 * (*matrix0_index_factor2);
        for (n = 0; n < inner_loop_count; n++) {
          sum32 += (WEBRTC_SPL_MUL_16_32_RSFT16(matrix0[matrix0_index],
              matrix1[matrix1_index] << shift));
          matrix1_index += matrix1_index_step;
          matrix0_index += matrix0_index_step;
        }
        matrix_product[matrix_prod_index] = sum32;
        matrix_prod_index++;
      }
    }
  }
}

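// Each iteration of the outer loop produces two outputs: the two lanes
// accumulate (matrix0 * matrix1) >> 16 products against consecutive matrix1
// entries, and the sums are scaled down by 2^3 before being stored.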
void WebRtcIsacfix_MatrixProduct2Neon(const int16_t matrix0[],
                                      const int32_t matrix1[],
                                      int32_t matrix_product[],
                                      const int matrix0_index_factor,
                                      const int matrix0_index_step) {
  int j = 0, n = 0;
  int matrix1_index = 0, matrix0_index = 0, matrix_prod_index = 0;
  int32x2_t sum_32x2 = vdup_n_s32(0);
  for (j = 0; j < SUBFRAMES; j++) {
    sum_32x2 = veor_s32(sum_32x2, sum_32x2);  // Initialize to zeros.
    matrix1_index = 0;
    matrix0_index = matrix0_index_factor * j;
    for (n = SUBFRAMES; n > 0; n--) {
      int32x2_t matrix0_32x2 =
          vdup_n_s32((int32_t)(matrix0[matrix0_index]) << 15);
      int32x2_t matrix1_32x2 = vld1_s32(&matrix1[matrix1_index]);
      int32x2_t multi_32x2 = vqdmulh_s32(matrix0_32x2, matrix1_32x2);
      sum_32x2 = vqadd_s32(sum_32x2, multi_32x2);
      matrix1_index += 2;
      matrix0_index += matrix0_index_step;
    }
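    // Scale both accumulated lanes down by 2^3 (arithmetic shift) and store.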
    sum_32x2 = vshr_n_s32(sum_32x2, 3);
    vst1_s32(&matrix_product[matrix_prod_index], sum_32x2);
    matrix_prod_index += 2;
  }
}