@
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS.  All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

@ cross_correlation_neon.s
@ This file contains the function WebRtcSpl_CrossCorrelationNeon(),
@ optimized for ARM Neon platform.
@
@ Reference C code at end of this file.
@ Output is bit-exact with the reference C code, but not with the generic
@ C code in file cross_correlation.c, due to reduction of shift operations
@ from using Neon registers.
@ Register usage:
@
@ r0: *cross_correlation (function argument)
@ r1: *seq1 (function argument)
@ r2: *seq2 (function argument)
@ r3: dim_seq (function argument); then, total iteration of LOOP_DIM_SEQ
@ r4: counter for LOOP_DIM_CROSS_CORRELATION
@ r5: seq2_ptr
@ r6: seq1_ptr
@ r7: Total iteration of LOOP_DIM_SEQ_RESIDUAL
@ r8, r9, r10, r11, r12: scratch
@
@ Stack arguments. Offsets are relative to sp after "push {r4-r11}", which
@ stores eight 4-byte registers (32 bytes) below the incoming arguments:
@   [sp, #32]: dim_cross_correlation
@   [sp, #36]: right_shifts
@   [sp, #40]: step_seq2
@
@ Neon register usage:
@   d16:      -right_shifts, sign-extended to 64 bits (shift count for vshl)
@   d17:      residual (scalar) sum, then the final 64-bit sum
@   q9, q14:  64-bit partial sums of the vectorized loop (two lanes each)
@   q10, q11: eight samples of seq1 / seq2
@   q12, q13: 32-bit products of the low / high four sample pairs

#include "webrtc/system_wrappers/interface/asm_defines.h"

@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
@                                     int16_t* seq1,
@                                     int16_t* seq2,
@                                     int16_t dim_seq,
@                                     int16_t dim_cross_correlation,
@                                     int16_t right_shifts,
@                                     int16_t step_seq2);
@
@ For each of the dim_cross_correlation outputs, accumulates the 64-bit sum
@ of seq1[j] * seq2[j] over dim_seq samples, shifts it right by right_shifts
@ and stores the low 32 bits. seq2 advances by step_seq2 samples per output.
@ Nothing is read or written when dim_cross_correlation <= 0.
GLOBAL_FUNCTION WebRtcSpl_CrossCorrelationNeon
.align  2
DEFINE_FUNCTION WebRtcSpl_CrossCorrelationNeon
  push {r4-r11}

  @ Put the shift value (-right_shifts) into a Neon register.
  @ r8 becomes the sign extension of r10, so d16 holds a 64-bit signed value.
  ldrsh r10, [sp, #36]
  rsb r10, r10, #0
  mov r8, r10, asr #31
  vmov d16, r10, r8

  @ Initialize loop counters.
  and r7, r3, #7                    @ inner_loop_len2 = dim_seq % 8;
  asr r3, r3, #3                    @ inner_loop_len1 = dim_seq / 8;
  ldrsh r4, [sp, #32]               @ dim_cross_correlation

  @ The outer loop below tests its counter only at the bottom, so without
  @ this check it would store one result even when no output was requested.
  cmp r4, #0
  ble END_CROSS_CORRELATION

LOOP_DIM_CROSS_CORRELATION:
  vmov.i32 q9, #0                   @ Clear both vector accumulators.
  vmov.i32 q14, #0
  movs r8, r3                       @ inner_loop_len1
  mov r6, r1                        @ seq1_ptr
  mov r5, r2                        @ seq2_ptr
  ble POST_LOOP_DIM_SEQ             @ Flags are from movs: skip if no full group.

LOOP_DIM_SEQ:
  @ Eight samples per iteration: 16x16 -> 32-bit products, then pairwise
  @ add-accumulate into 64-bit lanes.
  vld1.16 {d20, d21}, [r6]!         @ seq1_ptr
  vld1.16 {d22, d23}, [r5]!         @ seq2_ptr
  subs r8, r8, #1
  vmull.s16 q12, d20, d22
  vmull.s16 q13, d21, d23
  vpadal.s32 q9, q12
  vpadal.s32 q14, q13
  bgt LOOP_DIM_SEQ

POST_LOOP_DIM_SEQ:
  movs r10, r7                      @ Loop counter
  mov r12, #0                       @ High word of the residual sum.
  mov r8, #0                        @ Low word of the residual sum.
  ble POST_LOOP_DIM_SEQ_RESIDUAL    @ Flags are from movs: skip if no leftovers.

LOOP_DIM_SEQ_RESIDUAL:
  @ ldrh zero-extends, which is fine because smulbb multiplies the bottom
  @ halfwords as signed values.
  ldrh r11, [r6], #2
  ldrh r9, [r5], #2
  smulbb r11, r11, r9
  adds r8, r8, r11                  @ 64-bit accumulate: low word ...
  adc r12, r12, r11, asr #31        @ ... high word gets sign bits plus carry.
  subs r10, #1
  bgt LOOP_DIM_SEQ_RESIDUAL

POST_LOOP_DIM_SEQ_RESIDUAL:         @ Sum the results up and do the shift.
  vadd.i64 d18, d19
  vadd.i64 d28, d29
  vadd.i64 d18, d28
  vmov.32 d17[0], r8
  vmov.32 d17[1], r12
  vadd.i64 d17, d18
  vshl.s64 d17, d16                 @ Count in d16 is -right_shifts.
  vst1.32 d17[0], [r0]!             @ Store the output

  @ step_seq2 is reloaded on every pass because r8 is reused as scratch.
  @ It is read with ldrsh, like the other 16-bit stack arguments.
  ldrsh r8, [sp, #40]               @ step_seq2
  add r2, r8, lsl #1                @ prepare for seq2_ptr(r5) in the next loop.

  subs r4, #1
  bgt LOOP_DIM_CROSS_CORRELATION

END_CROSS_CORRELATION:
  pop {r4-r11}
  bx  lr

@ TODO(kma): Place this piece of reference code into a C code file.
@ void WebRtcSpl_CrossCorrelationNeon(int32_t* cross_correlation,
@                                     int16_t* seq1,
@                                     int16_t* seq2,
@                                     int16_t dim_seq,
@                                     int16_t dim_cross_correlation,
@                                     int16_t right_shifts,
@                                     int16_t step_seq2) {
@   int i = 0;
@   int j = 0;
@   int inner_loop_len1 = dim_seq >> 3;
@   int inner_loop_len2 = dim_seq - (inner_loop_len1 << 3);
@
@   assert(dim_cross_correlation > 0);
@   assert(dim_seq > 0);
@
@   for (i = 0; i < dim_cross_correlation; i++) {
@     int16_t *seq1_ptr = seq1;
@     int16_t *seq2_ptr = seq2 + (step_seq2 * i);
@     int64_t sum = 0;
@
@     for (j = inner_loop_len1; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     // Calculate the rest of the samples.
@     for (j = inner_loop_len2; j > 0; j -= 1) {
@       sum += WEBRTC_SPL_MUL_16_16(*seq1_ptr, *seq2_ptr);
@       seq1_ptr++;
@       seq2_ptr++;
@     }
@
@     *cross_correlation++ = (int32_t)(sum >> right_shifts);
@   }
@ }