1/*
2 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12#include <assert.h>
13
14#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
15
16// Autocorrelation function in fixed point.
17// NOTE! Different from SPLIB-version in how it scales the signal.
18int WebRtcIsacfix_AutocorrNeon(int32_t* __restrict r,
19                               const int16_t* x,
20                               int16_t n,
21                               int16_t order,
22                               int16_t* __restrict scale) {
23  int i = 0;
24  int16_t scaling = 0;
25  uint32_t temp = 0;
26  int64_t prod = 0;
27  int64_t prod_tail = 0;
28
29  assert(n % 4 == 0);
30  assert(n >= 8);
31
32  // Calculate r[0].
33  int16x4_t x0_v;
34  int32x4_t tmpa0_v;
35  int64x2_t tmpb_v;
36
37  tmpb_v = vdupq_n_s64(0);
38  const int16_t* x_start = x;
39  const int16_t* x_end0 = x_start + n;
40  while (x_start < x_end0) {
41    x0_v = vld1_s16(x_start);
42    tmpa0_v = vmull_s16(x0_v, x0_v);
43    tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
44    x_start += 4;
45  }
46
47#ifdef WEBRTC_ARCH_ARM64
48  prod = vaddvq_s64(tmpb_v);
49#else
50  prod = vget_lane_s64(vadd_s64(vget_low_s64(tmpb_v), vget_high_s64(tmpb_v)),
51                       0);
52#endif
53  // Calculate scaling (the value of shifting).
54  temp = (uint32_t)(prod >> 31);
55
56  scaling = temp ? 32 - WebRtcSpl_NormU32(temp) : 0;
57  r[0] = (int32_t)(prod >> scaling);
58
59  int16x8_t x1_v;
60  int16x8_t y_v;
61  int32x4_t tmpa1_v;
62  // Perform the actual correlation calculation.
63  for (i = 1; i < order + 1; i++) {
64    tmpb_v = vdupq_n_s64(0);
65    int rest = (n - i) % 8;
66    x_start = x;
67    x_end0 = x_start + n - i - rest;
68    const int16_t* y_start = x_start + i;
69    while (x_start < x_end0) {
70      x1_v = vld1q_s16(x_start);
71      y_v = vld1q_s16(y_start);
72      tmpa0_v = vmull_s16(vget_low_s16(x1_v), vget_low_s16(y_v));
73#ifdef WEBRTC_ARCH_ARM64
74      tmpa1_v = vmull_high_s16(x1_v, y_v);
75#else
76      tmpa1_v = vmull_s16(vget_high_s16(x1_v), vget_high_s16(y_v));
77#endif
78      tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
79      tmpb_v = vpadalq_s32(tmpb_v, tmpa1_v);
80      x_start += 8;
81      y_start += 8;
82    }
83    // The remaining calculation.
84    const int16_t* x_end1 = x + n - i;
85    if (rest >= 4) {
86        int16x4_t x2_v = vld1_s16(x_start);
87        int16x4_t y2_v = vld1_s16(y_start);
88        tmpa0_v = vmull_s16(x2_v, y2_v);
89        tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
90        x_start += 4;
91        y_start += 4;
92    }
93#ifdef WEBRTC_ARCH_ARM64
94    prod = vaddvq_s64(tmpb_v);
95#else
96    prod = vget_lane_s64(vadd_s64(vget_low_s64(tmpb_v), vget_high_s64(tmpb_v)),
97                         0);
98#endif
99
100    prod_tail = 0;
101    while (x_start < x_end1) {
102      prod_tail += *x_start * *y_start;
103      ++x_start;
104      ++y_start;
105    }
106
107    r[i] = (int32_t)((prod + prod_tail) >> scaling);
108  }
109
110  *scale = scaling;
111
112  return order + 1;
113}
114
115