@ fix/source/pitch_filter_armv6.S

@
@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ Contains the core loop routine for the pitch filter function in iSAC,
@ optimized for ARMv6 platforms.
@
@ Output is bit-exact with the reference C code in pitch_filter.c.

#include "settings.h"

.arch armv6
.align  2
.global WebRtcIsacfix_PitchFilterCore


@ void WebRtcIsacfix_PitchFilterCore(int loopNumber,
@                                    WebRtc_Word16 gain,
@                                    int index,
@                                    WebRtc_Word16 sign,
@                                    WebRtc_Word16* inputState,
@                                    WebRtc_Word16* outputBuf2,
@                                    const WebRtc_Word16* coefficient,
@                                    WebRtc_Word16* inputBuf,
@                                    WebRtc_Word16* outputBuf,
@                                    int* index2);
@
@ Runs loopNumber iterations of the pitch filter core loop. Per iteration:
@   1. A 9-tap dot product of outputBuf2[] history (starting at
@      outputBuf2[*index2 + PITCH_BUFFSIZE - index - 2]) with coefficient[].
@   2. The rounded, saturated result is scaled by gain and pushed into the
@      front of the 5-entry inputState[] delay line (older entries shift up).
@   3. inputState[] is filtered with kDampFilter[] (defined in this file).
@   4. outputBuf[*index2]  = sat16(inputBuf[*index2] - sign * filtered)
@      outputBuf2[*index2 + PITCH_BUFFSIZE] =
@                            sat16(outputBuf[*index2] + inputBuf[*index2])
@ On return *index2 has been advanced by loopNumber.
@ If loopNumber <= 0 the function returns immediately and touches nothing.
@
@ ABI:      ARM procedure call standard, ARM (not Thumb) instruction set.
@ In:       r0 = loopNumber, r1 = gain, r2 = index, r3 = sign
@           stack (at entry): [sp, #0] inputState, [sp, #4] outputBuf2,
@           [sp, #8] coefficient, [sp, #12] inputBuf, [sp, #16] outputBuf,
@           [sp, #20] index2
@ Out:      none (void)
@ Clobbers: r0-r3, r12, condition flags. r4-r11 are saved and restored.
@ Requires: hardware support for unaligned word loads/stores. The code does
@           32-bit accesses on 16-bit-aligned addresses (e.g. [inputState, #2]
@           and kDampFilter + 2).

WebRtcIsacfix_PitchFilterCore:
.fnstart
  @ The loop below is bottom-tested, so without this guard a loopNumber of
  @ zero or less would still execute one full iteration (writing outputBuf,
  @ outputBuf2 and inputState) and a negative count would move *index2
  @ backwards. Nothing has been pushed yet, so a plain return is enough.
  cmp r0, #0
  bxle lr

  push {r4-r11}               @ 32 bytes of callee-saved registers.
  sub sp, #8                  @ Local frame: [sp, #4] = sign, [sp] unused.
                              @ Stack arguments now start at [sp, #40].

  str r3, [sp, #4]            @ sign (r3 is reused as a pointer below).
  ldr r3, [sp, #44]           @ outputBuf2
  ldr r6, [sp, #60]           @ index2
  ldr r7, [r6]                @ *index2
  ldr r8, [sp, #52]           @ inputBuf
  ldr r12, [sp, #56]          @ outputBuf

  add r4, r7, r0
  str r4, [r6]                @ *index2 += loopNumber, stored up front.

  mov r10, r7, asl #1         @ Byte offset of element *index2.
  add r12, r10                @ &outputBuf[*index2]
  add r8, r10                 @ &inputBuf[*index2]

  add r4, r7, #PITCH_BUFFSIZE @ *index2 + PITCH_BUFFSIZE
  add r6, r3, r4, lsl #1      @ &outputBuf2[*index2 + PITCH_BUFFSIZE]
  sub r4, r2                  @ r2: index
  sub r4, #2                  @ *index2 + PITCH_BUFFSIZE - index - 2
  add r3, r4, lsl #1          @ &ubufQQpos2[*index2]
  ldr r9, [sp, #48]           @ coefficient

LOOP:
@ Usage of registers in the loop:
@  r0: loop counter (counts down to 0)
@  r1: gain
@  r2: tmpW32
@  r3: &ubufQQpos2[]
@  r6: &outputBuf2[]
@  r8: &inputBuf[]
@  r9: &coefficient[]
@  r12: &outputBuf[]
@  r4, r5, r7, r10, r11: scratch

  @ Filter to get fractional pitch.
  @ The pitch filter loop here is unrolled with 9 multiplications:
  @ four dual 16x16 multiply-accumulates on word-loaded sample pairs, plus
  @ one single 16x16 multiply-accumulate for the ninth tap.
  pld [r3]
  ldr r10, [r3], #4           @ ubufQQpos2[*index2 + 0, *index2 + 1]
  ldr r4, [r9], #4            @ coefficient[0, 1]
  ldr r11, [r3], #4           @ ubufQQpos2[*index2 + 2, *index2 + 3]
  ldr r5, [r9], #4            @ coefficient[2, 3]
  smuad r2, r10, r4
  smlad r2, r11, r5, r2

  ldr r10, [r3], #4           @ ubufQQpos2[*index2 + 4, *index2 + 5]
  ldr r4, [r9], #4            @ coefficient[4, 5]
  ldr r11, [r3], #4           @ ubufQQpos2[*index2 + 6, *index2 + 7]
  ldr r5, [r9], #4            @ coefficient[6, 7]
  smlad r2, r10, r4, r2
  ldrh r10, [r3], #-14        @ Tap 8. Net r3 change is +2 bytes, so r3 now
                              @ points at the window for the next sample.
  ldrh  r4, [r9], #-16        @ Tap 8. r9 back to &coefficient[0].
  smlad r2, r11, r5, r2
  smlabb r2, r10, r4, r2      @ Uses only the (signed) bottom halfwords.

  @ Saturate to avoid overflow in tmpW16.
  @ Halving first keeps the rounding add from overflowing. The sequence is
  @ equivalent to sat16((tmpW32 + 0x2000) >> 14).
  asr r2, #1
  add r4, r2, #0x1000
  ssat r7, #16, r4, asr #13

  @ Shift low pass filter state, and execute the low pass filter.
  @ The memmove() and the low pass filter loop are unrolled and mixed.
  smulbb r5, r1, r7           @ gain * tmpW16
  add r7, r5, #0x800
  asr r7, #12                 @ Get the value for inputState[0].
  ldr r11, [sp, #40]          @ inputState
  pld [r11]
  adr r10, kDampFilter        @ PC-relative: table must stay close to the code.
  ldrsh r4, [r10], #2         @ kDampFilter[0]
  @ NOTE(review): mul uses the full 32-bit r7 while strh below stores only
  @ its low 16 bits. The two agree only if r7 fits in 16 bits. Confirm the
  @ range of gain guarantees that.
  mul r2, r7, r4
  ldr r4, [r11]               @ inputState[0, 1], before shift.
  strh r7, [r11]              @ inputState[0], after shift.
  ldr r5, [r11, #4]           @ inputState[2, 3], before shift.
  ldr r7, [r10], #4           @ kDampFilter[1, 2] (unaligned word load)
  ldr r10, [r10]              @ kDampFilter[3, 4]
  str r4, [r11, #2]           @ inputState[1, 2], after shift.
  str r5, [r11, #6]           @ inputState[3, 4], after shift.
  smlad r2, r4, r7, r2
  smlad r2, r5, r10, r2

  @ Saturate to avoid overflow.
  @ First shift the sample to the range of [0xC0000000, 0x3FFFFFFF],
  @ to avoid overflow in the next saturation step. The sequence is
  @ equivalent to sat16((tmpW32 + 0x4000) >> 15).
  asr r2, #1
  add r10, r2, #0x2000
  ssat r10, #16, r10, asr #14

  @ Subtract from input and update buffer.
  ldr r11, [sp, #4]           @ sign
  ldrsh r7, [r8], #2          @ inputBuf[*index2]
  smulbb r5, r11, r10         @ sign * filtered sample
  subs r0, #1                 @ Sets the flags tested by bgt below. Nothing
                              @ in between modifies N, Z, C or V.
  sub r4, r7, r5
  ssat r2, #16, r4
  strh  r2, [r12], #2         @ outputBuf[*index2]

  add r2, r7                  @ Saturated output + input sample.
  ssat r2, #16, r2
  strh  r2, [r6], #2          @ outputBuf2[*index2 + PITCH_BUFFSIZE]
  bgt LOOP

  add sp, #8
  pop {r4-r11}
  bx  lr
.fnend

@ Coefficients of the 5-tap filter that WebRtcIsacfix_PitchFilterCore applies
@ to inputState[]. The taps are symmetric and sum to 32768.
@ The table is 4-byte aligned and is addressed PC-relative (adr) from the
@ code, so it has to stay in the same section as the function.
.balign 4
kDampFilter:
  .hword  -2294
  .hword  8192
  .hword  20972
  .hword  8192
  .hword  -2294