/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * filters_neon.c
 *
 * This file contains the function WebRtcIsacfix_AutocorrNeon, optimized for
 * the ARM NEON platform.
 *
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "codec.h"

// Autocorrelation function in fixed point, optimized with ARM NEON intrinsics.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Inputs:
//   x      - input samples; x[0] .. x[N - 1] are read.
//   N      - number of samples; must be a multiple of 4 (asserted).
//   order  - highest lag to compute.
// Outputs:
//   r      - r[0] .. r[order] receive the scaled autocorrelation values:
//            r[i] = (sum over j in [0, N - i) of x[j] * x[i + j]) >> *scale.
//   scale  - number of bits every r[i] was shifted right by.
// Return value: order + 1, i.e. the number of values written to r.
//
// All accumulation is done in 64 bits; only the final, shifted value is
// narrowed to 32 bits.
int WebRtcIsacfix_AutocorrNeon(
    WebRtc_Word32* __restrict r,
    const WebRtc_Word16* __restrict x,
    WebRtc_Word16 N,
    WebRtc_Word16 order,
    WebRtc_Word16* __restrict scale) {

  // The energy loop below consumes four samples per iteration.
  assert(N % 4 == 0);

  int i = 0;
  int16_t scaling = 0;

  // Step 1, calculate r[0] and how much scaling is needed.

  // Two 64-bit partial sums of x[i] * x[i]; vpadalq_s32 adds adjacent pairs
  // of the four 32-bit products into them.
  int64x2_t energy_acc = vdupq_n_s64(0);
  for (i = 0; i < N; i += 4) {
    const int16x4_t samples = vld1_s16(&x[i]);
    energy_acc = vpadalq_s32(energy_acc, vmull_s16(samples, samples));
  }
  const int64_t energy = vget_lane_s64(
      vadd_s64(vget_low_s64(energy_acc), vget_high_s64(energy_acc)), 0);

  // Calculate the value of shifting (scaling): the smallest right shift that
  // moves the most significant set bit of the energy down to bit 30 or
  // lower, so that r[0] fits in a non-negative 32-bit value.
  const uint32_t energy_high = (uint32_t)((uint64_t)energy >> 32);
  const uint32_t energy_low = (uint32_t)energy;
  if (energy_high != 0) {
    // __builtin_clz is only defined for non-zero arguments, which the test
    // above guarantees.
    scaling = (int16_t)(32 - __builtin_clz(energy_high) + 1);
  } else if ((energy_low & 0x80000000u) != 0) {
    scaling = 1;
  }

  // Record the result.
  r[0] = (int32_t)(energy >> scaling);


  // Step 2, perform the actual correlation calculation.

  /* Reference C code (for the rest of the function):
  for (i = 1; i < order + 1; i++)  {
    prod = 0;
    for (j = 0; j < N - i; j++) {
      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
    }
    sum = (int32_t)(prod >> scaling);
    r[i] = sum;
  }
  */

  for (i = 1; i < order + 1; i++) {
    // Number of products contributing to lag i.
    const int num_products = N - i;
    int j = 0;
    int64x2_t lag_acc = vdupq_n_s64(0);
    int64_t tail = 0;

    // Calculate the major block of the samples (a multiple of 8). Indexing
    // by count (rather than comparing against &x[N - i - 7]) avoids forming
    // a pointer in front of x when fewer than 7 products remain.
    for (; j + 8 <= num_products; j += 8) {
      const int16x8_t a = vld1q_s16(&x[j]);
      const int16x8_t b = vld1q_s16(&x[i + j]);
      lag_acc = vpadalq_s32(lag_acc,
                            vmull_s16(vget_low_s16(a), vget_low_s16(b)));
      lag_acc = vpadalq_s32(lag_acc,
                            vmull_s16(vget_high_s16(a), vget_high_s16(b)));
    }

    // Calculate the rest of the samples (at most 7 products).
    for (; j < num_products; j++) {
      tail += (int32_t)x[j] * (int32_t)x[i + j];
    }

    // Sum the results up, and do shift.
    const int64_t prod = vget_lane_s64(
        vadd_s64(vget_low_s64(lag_acc), vget_high_s64(lag_acc)), 0) + tail;

    // Record the result.
    r[i] = (int32_t)(prod >> scaling);
  }

  // Record the result.
  *scale = scaling;

  return(order + 1);
}
168