1a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin/* 2a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. 3a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * 4a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * Use of this source code is governed by a BSD-style license 5a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * that can be found in the LICENSE file in the root of the source 6a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * tree. An additional intellectual property rights grant can be found 7a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * in the file PATENTS. All contributing project authors may 8a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * be found in the AUTHORS file in the root of the source tree. 9a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin */ 10a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 11a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin/* 12a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * filters_neon.c 13a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * 14a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * This file contains function WebRtcIsacfix_AutocorrNeon, optimized for 15a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * ARM Neon platform. 16a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin * 17a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin */ 18a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 19a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin#include <arm_neon.h> 20a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin#include <assert.h> 21a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 22a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin#include "codec.h" 23a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 24a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin// Autocorrelation function in fixed point. 25a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin// NOTE! Different from SPLIB-version in how it scales the signal. 26a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkinint WebRtcIsacfix_AutocorrNeon( 27a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin WebRtc_Word32* __restrict r, 28a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin const WebRtc_Word16* __restrict x, 29a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin WebRtc_Word16 N, 30a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin WebRtc_Word16 order, 31a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin WebRtc_Word16* __restrict scale) { 32a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 33a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // The 1st for loop assumed N % 4 == 0. 34a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin assert(N % 4 == 0); 35a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 36a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int i = 0; 37a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int zeros_low = 0; 38a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int zeros_high = 0; 39a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int16_t scaling = 0; 40a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int32_t sum = 0; 41a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 42a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Step 1, calculate r[0] and how much scaling is needed. 43a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 44a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int16x4_t reg16x4; 45a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int64x1_t reg64x1a; 46a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int64x1_t reg64x1b; 47a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int32x4_t reg32x4; 48a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int64x2_t reg64x2 = vdupq_n_s64(0); // zeros 49a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 50a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Loop over the samples and do: 51a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]); 52a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (i = 0; i < N; i += 4) { 53a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg16x4 = vld1_s16(&x[i]); 54a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg32x4 = vmull_s16(reg16x4, reg16x4); 55a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x2 = vpadalq_s32(reg64x2, reg32x4); 56a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 57a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x1a = vget_low_s64(reg64x2); 58a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x1b = vget_high_s64(reg64x2); 59a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x1a = vadd_s64(reg64x1a, reg64x1b); 60a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 61a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Calculate the value of shifting (scaling). 62a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin __asm__ __volatile__( 63a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmov %[z_l], %[z_h], %P[reg]\n\t" 64a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "clz %[z_l], %[z_l]\n\t" 65a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "clz %[z_h], %[z_h]\n\t" 66a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[z_l]"+r"(zeros_low), 67a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [z_h]"+r"(zeros_high) 68a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[reg]"w"(reg64x1a) 69a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin ); 70a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin if (zeros_high != 32) { 71a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin scaling = (32 - zeros_high + 1); 72a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } else if (zeros_low == 0) { 73a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin scaling = 1; 74a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 75a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x1b = -scaling; 76a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin reg64x1a = vshl_s64(reg64x1a, reg64x1b); 77a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 78a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Record the result. 79a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin r[0] = (int32_t)vget_lane_s64(reg64x1a, 0); 80a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 81a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 82a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Step 2, perform the actual correlation calculation. 83a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 84a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin /* Original C code (for the rest of the function): 85a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (i = 1; i < order + 1; i++) { 86a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin prod = 0; 87a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (j = 0; j < N - i; j++) { 88a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]); 89a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 90a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin sum = (int32_t)(prod >> scaling); 91a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin r[i] = sum; 92a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 93a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin */ 94a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 95a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (i = 1; i < order + 1; i++) { 96a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int32_t prod_lower = 0; 97a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int32_t prod_upper = 0; 98a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin const int16_t* ptr0 = &x[0]; 99a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin const int16_t* ptr1 = &x[i]; 100a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin int32_t tmp = 0; 101a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 102a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Initialize the sum (q9) to zero. 103a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9"); 104a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 105a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Calculate the major block of the samples (a multiple of 8). 106a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (; ptr0 < &x[N - i - 7];) { 107a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin __asm__ __volatile__( 108a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vld1.16 {d20, d21}, [%[ptr0]]!\n\t" 109a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vld1.16 {d22, d23}, [%[ptr1]]!\n\t" 110a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmull.s16 q12, d20, d22\n\t" 111a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmull.s16 q13, d21, d23\n\t" 112a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vpadal.s32 q9, q12\n\t" 113a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vpadal.s32 q9, q13\n\t" 114a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 115a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Specify constraints. 116a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[ptr0]"+r"(ptr0), 117a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [ptr1]"+r"(ptr1) 118a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin : 119a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27" 120a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin ); 121a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 122a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 123a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Calculate the rest of the samples. 124a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin for (; ptr0 < &x[N - i]; ptr0++, ptr1++) { 125a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin __asm__ __volatile__( 126a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "smulbb %[tmp], %[ptr0], %[ptr1]\n\t" 127a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "adds %[prod_lower], %[prod_lower], %[tmp]\n\t" 128a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t" 129a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 130a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Specify constraints. 131a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[prod_lower]"+r"(prod_lower), 132a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [prod_upper]"+r"(prod_upper), 133a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [tmp]"+r"(tmp) 134a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[ptr0]"r"(*ptr0), 135a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [ptr1]"r"(*ptr1) 136a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin ); 137a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 138a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 139a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Sum the results up, and do shift. 140a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin __asm__ __volatile__( 141a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vadd.i64 d18, d19\n\t" 142a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmov.32 d17[0], %[prod_lower]\n\t" 143a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmov.32 d17[1], %[prod_upper]\n\t" 144a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vadd.i64 d17, d18\n\t" 145a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "mov %[tmp], %[scaling], asr #31\n\t" 146a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmov.32 d16, %[scaling], %[tmp]\n\t" 147a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vshl.s64 d17, d16\n\t" 148a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin "vmov.32 %[sum], d17[0]\n\t" 149a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 150a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Specify constraints. 151a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[sum]"=r"(sum), 152a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [tmp]"+r"(tmp) 153a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :[prod_upper]"r"(prod_upper), 154a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [prod_lower]"r"(prod_lower), 155a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin [scaling]"r"(-scaling) 156a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin :"d16", "d17", "d18", "d19" 157a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin ); 158a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 159a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Record the result. 160a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin r[i] = sum; 161a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin } 162a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 163a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin // Record the result. 164a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin *scale = scaling; 165a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin 166a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin return(order + 1); 167a6451827d543eb00824bc95097e47d0aac51ae93Alexander Gutkin} 168