@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ lattice_neon.s
@
@ Contains a function for the core loop in the normalized lattice MA
@ filter routine for iSAC codec, optimized for ARM Neon platform.
@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,
@                                     int16_t input1,
@                                     int32_t input2,
@                                     int32_t* ptr0,
@                                     int32_t* ptr1,
@                                     int32_t* __restrict ptr2);
@ It calculates
@   *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));
@   *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
@ in Q15 domain.
@
@ Reference code in lattice.c.
@ Output is not bit-exact with the reference C code, due to the replacement
@ of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ instructions, smulwb, and smull. Speech quality was not degraded by
@ testing speech and tone vectors.
.arch armv7-a
.fpu neon

#include "settings.h"

.global WebRtcIsacfix_FilterMaLoopNeon

.align  2

@ Register usage (AAPCS):
@   r0 = input0 (int16_t), r1 = input1 (int16_t), r2 = input2 (int32_t)
@   r3 = ptr0; the 5th/6th args (ptr1/ptr2) arrive on the stack and,
@   after push {r4-r8} (20 bytes), sit at [sp, #20] / [sp, #24].
@   d28/d29/d30 hold input0/input1/input2 broadcast across both lanes.
WebRtcIsacfix_FilterMaLoopNeon:
.fnstart

@ Unwind annotation (ARM EHABI): r4-r8 are saved by the prologue.
.save {r4-r8}
  push        {r4-r8}

  vdup.32     d28, r0             @ Initialize Neon register with input0
  vdup.32     d29, r1             @ Initialize Neon register with input1
  vdup.32     d30, r2             @ Initialize Neon register with input2
  ldr         r4, [sp, #20]       @ ptr1
  ldr         r12, [sp, #24]      @ ptr2

  @ Number of loop iterations after unrolling: r5 = (HALF_SUBFRAMELEN - 1) >> 2
  @ Leftover samples after the loop, in r6:
  @    r6 = (HALF_SUBFRAMELEN - 1) - ((HALF_SUBFRAMELEN - 1) >> 2 << 2)
  mov         r6, #HALF_SUBFRAMELEN
  sub         r6, #1
  lsr         r5, r6, #2
  sub         r6, r5, lsl #2      @ r6 -= (r5 << 2)

  @ First r5 iterations in a loop, processing 4 samples per iteration.

LOOP:
  vld1.32     {d0, d1}, [r3]!     @ load 4 x *ptr0

  vmull.s32   q10, d0, d28        @ tmp32a = input0 * (*ptr0), 64-bit products
  vmull.s32   q11, d1, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q12, d0, d29        @ input1 * (*ptr0)
  vmull.s32   q13, d1, d29        @ input1 * (*ptr0)

  vrshrn.i64  d4, q10, #15        @ tmp32a >>= 15 (rounded narrow to 32-bit)
  vrshrn.i64  d5, q11, #15

  vld1.32     {d2, d3}, [r12]     @ load 4 x *ptr2
  vadd.i32    q3, q2, q1          @ tmp32b = *ptr2 + tmp32a

  vrshrn.i64  d0, q12, #15        @ (input1 * (*ptr0)) >> 15

  vmull.s32   q10, d6, d30        @ input2 * tmp32b
  vmull.s32   q11, d7, d30        @ input2 * tmp32b

  vrshrn.i64  d16, q10, #16       @ new *ptr2 = (input2 * tmp32b) >> 16
  vrshrn.i64  d17, q11, #16

  vmull.s32   q10, d16, d28       @ input0 * (new *ptr2)
  vmull.s32   q11, d17, d28       @ input0 * (new *ptr2)

  vrshrn.i64  d1, q13, #15        @ (input1 * (*ptr0)) >> 15, high half
  vrshrn.i64  d18, q10, #15       @ (input0 * (new *ptr2)) >> 15
  vrshrn.i64  d19, q11, #15

  vst1.32     {d16, d17}, [r12]!  @ store 4 x *ptr2

  vadd.i32    q9, q0, q9          @ *ptr1 = input1*(*ptr0) + input0*(*ptr2)
  subs        r5, #1
  vst1.32     {d18, d19}, [r4]!   @ store 4 x *ptr1

  bgt         LOOP

  @ Check how many samples still need to be processed.
  subs        r6, #2
  blt         LAST_SAMPLE

  @ Process two more samples (same math as the main loop, on one d-register):
  vld1.32     d0, [r3]!           @ load 2 x *ptr0

  vmull.s32   q11, d0, d28        @ tmp32a = input0 * (*ptr0)
  vmull.s32   q13, d0, d29        @ input1 * (*ptr0)

  vld1.32     d18, [r12]          @ load 2 x *ptr2
  vrshrn.i64  d4, q11, #15        @ tmp32a >>= 15 (rounded)

  vadd.i32    d7, d4, d18         @ tmp32b = *ptr2 + tmp32a
  vmull.s32   q11, d7, d30        @ input2 * tmp32b
  vrshrn.i64  d16, q11, #16       @ new *ptr2 = (input2 * tmp32b) >> 16

  vmull.s32   q11, d16, d28       @ input0 * (new *ptr2)
  vst1.32     d16, [r12]!         @ store 2 x *ptr2

  vrshrn.i64  d0, q13, #15        @ (input1 * (*ptr0)) >> 15
  vrshrn.i64  d19, q11, #15       @ (input0 * (new *ptr2)) >> 15
  vadd.i32    d19, d0, d19        @ *ptr1 = input1*(*ptr0) + input0*(*ptr2)

  vst1.32     d19, [r4]!          @ store 2 x *ptr1

  @ If there's still one more sample, process it with scalar code here.
LAST_SAMPLE:
  cmp         r6, #1
  bne         END

  @ *ptr2 = input2 * (*ptr2 + input0 * (*ptr0));

  ldr         r7, [r3]            @ *ptr0
  ldr         r8, [r12]           @ *ptr2

  smulwb      r5, r7, r0          @ tmp32a = *ptr0 * input0 >> 16
  add         r8, r8, r5, lsl #1  @ tmp32b = *ptr2 + (tmp32a << 1), net >> 15
  smull       r5, r6, r8, r2      @ tmp32b * input2, in 64 bits (r6:r5)
  lsl         r6, #16
  add         r6, r5, lsr #16     @ Only take the middle 32 bits (product >> 16)
  str         r6, [r12]           @ Output (*ptr2, as 32 bits)

  @ *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);

  smulwb      r5, r7, r1          @ tmp32a = *ptr0 * input1 >> 16
  smulwb      r6, r6, r0          @ tmp32b = *ptr2 * input0 >> 16
  lsl         r5, r5, #1          @ tmp32a <<= 1, net >> 15
  add         r5, r6, lsl #1      @ r5 += (tmp32b << 1)
  str         r5, [r4]            @ Output (*ptr1)

END:
  pop         {r4-r8}
  bx          lr

.fnend