1@
2@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3@
4@ Use of this source code is governed by a BSD-style license
5@ that can be found in the LICENSE file in the root of the source
6@ tree. An additional intellectual property rights grant can be found
7@ in the file PATENTS.  All contributing project authors may
8@ be found in the AUTHORS file in the root of the source tree.
9@
10
11@ vector_scaling_operations_neon.s
12@ This file contains the function WebRtcSpl_ScaleAndAddVectorsWithRoundNeon(),
13@ optimized for ARM Neon platform. Output is bit-exact with the reference
14@ C code in vector_scaling_operations.c.
15
16#include "webrtc/system_wrappers/interface/asm_defines.h"
17
@ -----------------------------------------------------------------------------
@ WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
@
@ For every i in [0, length):
@   out_vector[i] = (in_vector1[i] * in_vector1_scale +
@                    in_vector2[i] * in_vector2_scale +
@                    round_value) >> right_shifts, truncated to 16 bits,
@ where round_value = (1 << right_shifts) >> 1.
@
@ Arguments as read by this code:
@   r0        in_vector1        (16-bit elements)
@   r1        in_vector1_scale  (low halfword used)
@   r2        in_vector2        (16-bit elements)
@   r3        in_vector2_scale  (low halfword used)
@   [sp, #0]  right_shifts      (loaded as a signed halfword)
@   [sp, #4]  out_vector        (16-bit elements)
@   [sp, #8]  length            (32-bit word)
@ Stack offsets are relative to sp at entry; the code below adds 24 for the
@ six registers it pushes.
@
@ Clobbers: r0 and r2 (advanced past the consumed input), q0-q3, q12-q15,
@ flags. r4-r9 are saved and restored.
@
@ NOTE(review): r0 is never loaded with a status value before returning. If
@ the C prototype returns a value, confirm that callers do not rely on it.
@ -----------------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_ScaleAndAddVectorsWithRoundNeon
  push {r4-r9}                @ 6 registers = 24 bytes; stack args now at sp+24.

  ldr r4, [sp, #32]           @ length
  ldr r5, [sp, #28]           @ out_vector
  ldrsh r6, [sp, #24]         @ right_shifts

  cmp r4, #0
  ble END                     @ Return if length <= 0.

  cmp r4, #8
  blt SET_ROUND_VALUE         @ Fewer than 8 elements: scalar loop only.

  vdup.16 d26, r1             @ in_vector1_scale
  vdup.16 d27, r3             @ in_vector2_scale

  @ Neon instructions can only right shift by an immediate value. To shift
  @ right by a register value, we have to do a left shift by the negative
  @ value.
  rsb r7, r6, #0
  @ The shift vector is consumed by vrshl.s32 below, so replicate it per
  @ 32-bit lane (vrshl takes the shift from the low byte of each lane).
  vdup.32 q12, r7             @ -right_shifts

  bic r7, r4, #7              @ Counter for LOOP_UNROLLED_BY_8: length / 8 * 8.

LOOP_UNROLLED_BY_8:
  vld1.16 {d28, d29}, [r0]!   @ in_vector1[], 8 elements
  vld1.16 {d30, d31}, [r2]!   @ in_vector2[], 8 elements
  vmull.s16 q0, d28, d26      @ in_vector1[0..3] * in_vector1_scale (32 bit)
  vmull.s16 q1, d29, d26      @ in_vector1[4..7] * in_vector1_scale
  vmull.s16 q2, d30, d27      @ in_vector2[0..3] * in_vector2_scale
  vmull.s16 q3, d31, d27      @ in_vector2[4..7] * in_vector2_scale
  vadd.s32 q0, q2
  vadd.s32 q1, q3
  vrshl.s32 q0, q12           @ Round shift right by right_shifts.
  vrshl.s32 q1, q12
  vmovn.i32 d0, q0            @ Cast to 16 bit values.
  vmovn.i32 d1, q1
  subs r7, #8
  vst1.16 {d0, d1}, [r5]!     @ Does not touch the flags set by subs.
  bgt LOOP_UNROLLED_BY_8

  @ Only the elements the vector loop did not cover remain. The mask must be
  @ 7 (not 0xFF): a wider mask makes the scalar loop run past the end of the
  @ buffers whenever length >= 8 and length is not a multiple of 8.
  ands r4, #7                 @ Counter for LOOP_NO_UNROLLING: length % 8.
  beq END

SET_ROUND_VALUE:
  mov r9, #1
  lsl r9, r6
  lsr r9, #1                  @ r9 = (1 << right_shifts) >> 1

LOOP_NO_UNROLLING:
  ldrh  r7, [r0], #2          @ in_vector1[i]
  ldrh  r8, [r2], #2          @ in_vector2[i]
  smulbb r7, r7, r1           @ Signed 16x16 multiply of the bottom halfwords.
  smulbb r8, r8, r3
  subs r4, #1                 @ Flags consumed by bne below; nothing in
                              @ between updates them.
  add r7, r9
  add r7, r8
  asr r7, r6
  strh r7, [r5], #2           @ Store the low 16 bits to out_vector[i].
  bne LOOP_NO_UNROLLING

END:
  pop {r4-r9}
  bx  lr
83