@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@
@ Reference code in filters.c. Output is bit-exact.

#include "webrtc/system_wrappers/interface/asm_defines.h"

GLOBAL_FUNCTION WebRtcIsacfix_AutocorrNeon
.align  2

@ int WebRtcIsacfix_AutocorrNeon(
@     int32_t* __restrict r,
@     const int16_t* __restrict x,
@     int16_t N,
@     int16_t order,
@     int16_t* __restrict scale);
@
@ Arguments as consumed by the routine:
@   r0      r      output; r[0] and then one word per lag are stored here
@   r1      x      input samples, read as signed 16-bit values
@   r2      N      number of input samples
@   r3      order  highest lag to compute
@   [sp]    scale  fifth argument, read from the stack; receives the right
@                  shift that was applied to every stored coefficient
@ Return value (r0): order + 1.

DEFINE_FUNCTION WebRtcIsacfix_AutocorrNeon
  @ Computes r[0] = sum(x[n] * x[n]) and, for i = 1 .. order,
  @ r[i] = sum(x[n] * x[n + i]) over N - i products. All sums are formed in
  @ 64 bits; r[0] is shifted right just enough to fit in 31 bits and the same
  @ shift is applied to every r[i]. The shift is written to *scale and
  @ order + 1 is returned in r0.
  @
  @ Register roles:
  @   r0  - write pointer into r[], post-incremented after every store
  @   r1  - x (never modified)
  @   r2  - N (reused for &scale just before returning)
  @   r3  - order
  @   r5  - read pointer while computing r[0], afterwards the lag index i
  @   d29 - (-scale) in its low word; shift count for every r[i]
  @
  @ LOOP_R0 always consumes four samples per iteration, so N is expected to
  @ be a multiple of 4.
  push       {r3 - r12}            @ 10 words = 40 bytes, see the &scale load.

  @ Constant initializations
  mov        r4, #33
  vmov.i32   d0, #0
  vmov.i32   q8, #0                @ Two 64-bit accumulators for r[0].
  vmov.i32   d29, #0               @ Initialize (-scale).
  vmov.u8    d30, #255             @ Initialize d30 as -1.
  vmov.i32   d0[0], r4             @ d0: 33 (low word), 0 (high word)
  vmov.i32   d25, #32              @ 32 in both words.

  mov        r5, r1                @ x
  mov        r6, r2                @ N

@ Generate the first coefficient r0.
LOOP_R0:
  vld1.16    {d18}, [r5]!          @ x[]
  subs       r6, r6, #4
  vmull.s16  q9, d18, d18          @ Four 32-bit squares.
  vpadal.s32 q8, q9                @ Pairwise add into the 64-bit accumulators.
  bgt        LOOP_R0

  vadd.i64   d16, d16, d17         @ d16 = 64-bit sum of squares.

  @ Calculate scaling (the value of shifting).
  vmov       d17, d16              @ Default: r[0] unshifted, scale = 0.

  @ Check overflow and determine the value for 'scale'.
  @ vclz cannot deal with a 64-bit, so we have to do vclz on both the upper and
  @ lower 32-bit words. The value of the upper word in d17 is irrelevant
  @ because only d17[0] is stored.

  @ Check the case of 1 bit overflow. If it occurs store the results for
  @ scale and r[0] in d17 and d29.

  vshr.u64   d3, d16, #1
  vclt.s32   d1, d16, #0           @ < 0 ? (per 32-bit word)
  vbit       d17, d3, d1           @ For r[0]
  vbit       d29, d30, d1          @ -scale = -1

  @ For the case of more than 1 bit overflow. If it occurs overwrite the
  @ results for scale and r[0] in d17 and d29.
  vclz.s32   d5, d16               @ Leading zeros of the two 32 bit words.
  vshr.s64   d26, d5, #32          @ Keep only the upper 32 bits.
  vsub.i64   d31, d26, d0          @ zeros - 33
  vshl.i64   d27, d26, #32
  vorr       d27, d26              @ Duplicate the high word with its low one.
  vshl.u64   d2, d16, d31          @ Shift by (-scale).
  vclt.s32   d1, d27, d25          @ < 32 ? (upper word of the sum is non-zero)
  vbit       d17, d2, d1           @ For r[0]
  vbit       d29, d31, d1          @ -scale

  vst1.32    d17[0], [r0]!         @ r[0]
  mov        r5, #1                @ outer loop counter

  @ LOOP_R tests its counter at the bottom only. Apply the same test up front
  @ so that nothing beyond r[0] is written when order < 1.
  cmp        r5, r3
  bgt        STORE_SCALE

@ Generate rest of the coefficients
LOOP_R:
  vmov.i32   q8, #0                @ Initialize the accumulation result.
  vmov.i32   q9, #0                @ Initialize the accumulation result.
  mov        r7, r1                @ &x[0]
  add        r6, r7, r5, lsl #1    @ x[i]
  sub        r12, r2, r5           @ N - i
  lsr        r8, r12, #3           @ inner loop counter
  sub        r12, r8, lsl #3       @ Leftover samples to be processed

  @ LOOP_8X_SAMPLES also tests its counter at the bottom only. Skip it when
  @ there are fewer than 8 products, otherwise 8 unwanted products would be
  @ accumulated.
  cmp        r8, #0
  beq        CHECK_FOUR_SAMPLES

LOOP_8X_SAMPLES:                   @ Multiple of 8 samples
  vld1.16    {d20, d21}, [r7]!     @ x[0, ...]
  vld1.16    {d22, d23}, [r6]!     @ x[i, ...]
  vmull.s16  q12, d20, d22
  vmull.s16  q13, d21, d23
  subs       r8, #1
  vpadal.s32 q8, q12
  vpadal.s32 q9, q13
  bgt        LOOP_8X_SAMPLES

CHECK_FOUR_SAMPLES:
  cmp r12, #4
  blt REST_SAMPLES

Four_SAMPLES:                      @ One block of 4 leftover samples
  vld1.16    d20, [r7]!
  vld1.16    d22, [r6]!
  vmull.s16  q12, d20, d22
  vpadal.s32 q8, q12
  sub r12, #4

REST_SAMPLES:                      @ Remaining samples, one at a time
  mov        r8, #0                @ Initialize lower word of the accumulation.
  mov        r4, #0                @ Initialize upper word of the accumulation.
  cmp r12, #0
  ble SUMUP

LOOP_REST_SAMPLES:
  ldrh       r9, [r7], #2          @ x[0, ...]
  ldrh       r10, [r6], #2         @ x[i, ...]
  smulbb     r11, r9, r10          @ Signed product of the low halfwords.
  adds       r8, r8, r11           @ lower word of the accumulation.
  adc        r4, r4, r11, asr #31  @ upper word of the accumulation.
  subs       r12, #1
  bgt        LOOP_REST_SAMPLES

@ Added the multiplication results together and do a shift.
SUMUP:
  vadd.i64   d16, d17
  vadd.i64   d18, d19
  vadd.i64   d18, d16
  vmov       d17, r8, r4           @ Scalar partial sum: r8 low, r4 high.
  vadd.i64   d18, d17
  vshl.s64   d18, d29              @ Shift left by (-scale).
  vst1.32    d18[0], [r0]!         @ r[i]

  add        r5, #1
  cmp        r5, r3
  ble        LOOP_R

STORE_SCALE:
  vneg.s32   d29, d29              @ Get value for 'scale'.
  ldr        r2, [sp, #40]         @ &scale (above the 40 bytes pushed on entry)
  add        r0, r3, #1            @ return (order + 1)
  vst1.s16   d29[0], [r2]          @ Store 'scale'

  pop        {r3 - r12}
  bx         lr