@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ cross_correlation_neon.s
@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
@ optimized for the ARM Neon platform.
@
@ Reference C code at end of this file.
@ Output is bit-exact with the reference C code, but not with the generic
@ C code in file cross_correlation.c, due to reduction of shift operations
@ from using Neon registers.

@ Register usage:
@
@ r0: *cross_correlation (function argument)
@ r1: *seq1 (function argument)
@ r2: *seq2 (function argument)
@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
@ r4: counter for LOOP_DIM_CROSS_CORRELATION
@ r5: seq2_ptr
@ r6: seq1_ptr
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
@ r8, r9, r10, r11, r12: scratch

#include "webrtc/system_wrappers/interface/asm_defines.h"

@ -----------------------------------------------------------------------
@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
@                                     int16_t* seq1,
@                                     int16_t* seq2,
@                                     int16_t dim_seq,
@                                     int16_t dim_cross_correlation,
@                                     int16_t right_shifts,
@                                     int16_t step_seq2);
@
@ For each of the dim_cross_correlation outputs, accumulates the 64-bit sum
@ of seq1[j] * seq2[j] over dim_seq samples, shifts it right by
@ right_shifts, and stores the low 32 bits. seq2 advances by step_seq2
@ samples between outputs.
@
@ In:    r0 = cross_correlation, r1 = seq1, r2 = seq2, r3 = dim_seq
@        After the push of 8 registers (32 bytes) below:
@          [sp, #32] = dim_cross_correlation
@          [sp, #36] = right_shifts
@          [sp, #40] = step_seq2
@ Out:   cross_correlation[0 .. dim_cross_correlation - 1]
@ Saved: r4-r11 (pushed/popped here)
@ Written and not restored: r0, r2, r3, r12, q8-q14, flags
@ Nothing is stored when dim_cross_correlation <= 0.
@ -----------------------------------------------------------------------
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
  push {r4-r11}

  @ Outer loop counter. The outer loop is bottom-tested, so leave now when
  @ there is nothing to compute instead of storing one unwanted output.
  ldrsh r4, [sp, #32]         @ dim_cross_correlation
  cmp r4, #0
  ble END_CROSS_CORRELATION

  @ Put the shift value (-right_shifts) into a Neon register, sign-extended
  @ to 64 bits, so that the final vshl.s64 computes sum >> right_shifts.
  ldrsh r10, [sp, #36]        @ right_shifts
  rsb r10, r10, #0            @ r10 = -right_shifts
  mov r8, r10, asr #31        @ r8 = sign extension word of r10
  vmov d16, r10, r8           @ d16 = (int64_t)(-right_shifts)

  @ Initialize inner loop counters.
  and r7, r3, #7              @ inner_loop_len2 = dim_seq % 8;
  asr r3, r3, #3              @ inner_loop_len1 = dim_seq / 8;

LOOP_DIM_CROSS_CORRELATION:
  vmov.i32 q9, #0             @ 64-bit partial sums for samples 0-3 of a group
  vmov.i32 q14, #0            @ 64-bit partial sums for samples 4-7 of a group
  movs r8, r3                 @ inner_loop_len1; flags are used by ble below
  mov r6, r1                  @ seq1_ptr
  mov r5, r2                  @ seq2_ptr
  ble POST_LOOP_DIM_SEQ       @ Fewer than 8 samples: residual loop only.

LOOP_DIM_SEQ:                 @ 8 samples per iteration
  vld1.16 {d20, d21}, [r6]!   @ seq1_ptr
  vld1.16 {d22, d23}, [r5]!   @ seq2_ptr
  subs r8, r8, #1
  vmull.s16 q12, d20, d22     @ 4 x 32-bit products
  vmull.s16 q13, d21, d23     @ 4 x 32-bit products
  vpadal.s32 q9, q12          @ Add product pairs into the 64-bit lanes.
  vpadal.s32 q14, q13
  bgt LOOP_DIM_SEQ

POST_LOOP_DIM_SEQ:
  movs r10, r7                @ Loop counter; flags are used by ble below
  mov r12, #0                 @ High word of the residual sum
  mov r8, #0                  @ Low word of the residual sum
  ble POST_LOOP_DIM_SEQ_RESIDUAL

LOOP_DIM_SEQ_RESIDUAL:        @ One sample per iteration
  ldrh r11, [r6], #2
  ldrh r9, [r5], #2
  smulbb r11, r11, r9         @ Signed 16x16 multiply of the low halfwords
  adds r8, r8, r11            @ r12:r8 += (int64_t)r11
  adc r12, r12, r11, asr #31
  subs r10, #1
  bgt LOOP_DIM_SEQ_RESIDUAL

POST_LOOP_DIM_SEQ_RESIDUAL:   @ Sum the results up and do the shift.
  vadd.i64 d18, d19
  vadd.i64 d28, d29
  vadd.i64 d18, d28           @ d18 = total of the vector loop
  vmov.32 d17[0], r8
  vmov.32 d17[1], r12         @ d17 = total of the residual loop
  vadd.i64 d17, d18
  vshl.s64 d17, d16           @ Shift count is negative: arithmetic right shift
  vst1.32 d17[0], [r0]!       @ Store the output (low 32 bits)

  @ step_seq2 is an int16_t like the other stack arguments, so it is loaded
  @ as a signed halfword too. r8 is reused as scratch inside the loop, hence
  @ the reload on every iteration.
  ldrsh r8, [sp, #40]         @ step_seq2
  add r2, r8, lsl #1          @ prepare for seq2_ptr(r5) in the next loop.

  subs r4, #1
  bgt LOOP_DIM_CROSS_CORRELATION

END_CROSS_CORRELATION:
  pop {r4-r11}
  bx  lr

@ TODO(kma): Place this piece of reference code into a C code file.
@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
@                                     int16_t* seq1,
@                                     int16_t* seq2,
@                                     int16_t dim_seq,
@                                     int16_t dim_cross_correlation,
@                                     int16_t right_shifts,
@                                     int16_t step_seq2) {
@   int i = 0;
@   int j = 0;
@   int inner_loop_len1 = dim_seq >> 3;
@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
@
@   assert(dim_cross_correlation > 0);
@   assert(dim_seq > 0);
@
@   for (i = 0; i < dim_cross_correlation; i++) {
@     int16_t *seq1_ptr = seq1;
@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
@     int64_t sum = 0;
@
@     for (j = inner_loop_len1; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     // Calculate the rest of the samples.
@     for (j = inner_loop_len2; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
@   }
@ }
160