/* Copyright (c) 2014-2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file celt_neon_intr.c
   @brief ARM Neon Intrinsic optimizations for celt
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER 24c91ee5b5642fcc4969150f73d5f6848f88bf1638flim OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25c91ee5b5642fcc4969150f73d5f6848f88bf1638flim EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26c91ee5b5642fcc4969150f73d5f6848f88bf1638flim PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27c91ee5b5642fcc4969150f73d5f6848f88bf1638flim PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28c91ee5b5642fcc4969150f73d5f6848f88bf1638flim LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29c91ee5b5642fcc4969150f73d5f6848f88bf1638flim NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31c91ee5b5642fcc4969150f73d5f6848f88bf1638flim*/ 32c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 33c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#ifdef HAVE_CONFIG_H 34c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "config.h" 35c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#endif 36c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 37c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include <arm_neon.h> 38c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#include "../pitch.h" 39c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 40d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim#if defined(FIXED_POINT) 41d03c373974c945b4b62b59b873522387418a2a3fFelicia Limvoid xcorr_kernel_neon_fixed(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[4], int len) 42d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim{ 43d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int j; 44d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a = vld1q_s32(sum); 45d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim /* Load y[0...3] */ 46d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim /* This requires len>0 to always be valid (which we assert in the C code). 
*/ 47d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y0 = vld1_s16(y); 48d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim y += 4; 49d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 50d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim for (j = 0; j + 8 <= len; j += 8) 51d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim { 52d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim /* Load x[0...7] */ 53d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x8_t xx = vld1q_s16(x); 54d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t x0 = vget_low_s16(xx); 55d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t x4 = vget_high_s16(xx); 56d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim /* Load y[4...11] */ 57d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x8_t yy = vld1q_s16(y); 58d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y4 = vget_low_s16(yy); 59d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y8 = vget_high_s16(yy); 60d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a0 = vmlal_lane_s16(a, y0, x0, 0); 61d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a1 = vmlal_lane_s16(a0, y4, x4, 0); 62d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 63d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y1 = vext_s16(y0, y4, 1); 64d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y5 = vext_s16(y4, y8, 1); 65d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a2 = vmlal_lane_s16(a1, y1, x0, 1); 66d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a3 = vmlal_lane_s16(a2, y5, x4, 1); 67d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 68d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y2 = vext_s16(y0, y4, 2); 69d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y6 = vext_s16(y4, y8, 2); 70d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a4 = vmlal_lane_s16(a3, y2, x0, 2); 
71d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a5 = vmlal_lane_s16(a4, y6, x4, 2); 72d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 73d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y3 = vext_s16(y0, y4, 3); 74d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y7 = vext_s16(y4, y8, 3); 75d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a6 = vmlal_lane_s16(a5, y3, x0, 3); 76d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a7 = vmlal_lane_s16(a6, y7, x4, 3); 77d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 78d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim y0 = y8; 79d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim a = a7; 80d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim x += 8; 81d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim y += 8; 82d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim } 83d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 84d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim for (; j < len; j++) 85d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim { 86d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t x0 = vld1_dup_s16(x); /* load next x */ 87d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int32x4_t a0 = vmlal_s16(a, y0, x0); 88d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 89d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim int16x4_t y4 = vld1_dup_s16(y); /* load next y */ 90d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim y0 = vext_s16(y0, y4, 1); 91d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim a = a0; 92d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim x++; 93d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim y++; 94d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim } 95d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 96d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim vst1q_s32(sum, a); 97d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim} 98d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim 
99d03c373974c945b4b62b59b873522387418a2a3fFelicia Lim#else 100c91ee5b5642fcc4969150f73d5f6848f88bf1638flim/* 101c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * Function: xcorr_kernel_neon_float 102c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * --------------------------------- 103c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * Computes 4 correlation values and stores them in sum[4] 104c91ee5b5642fcc4969150f73d5f6848f88bf1638flim */ 105c91ee5b5642fcc4969150f73d5f6848f88bf1638flimstatic void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y, 106c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32_t sum[4], int len) { 107c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32x4_t YY[3]; 108c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32x4_t YEXT[3]; 109c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32x4_t XX[2]; 110c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32x2_t XX_2; 111c91ee5b5642fcc4969150f73d5f6848f88bf1638flim float32x4_t SUMM; 112c91ee5b5642fcc4969150f73d5f6848f88bf1638flim const float32_t *xi = x; 113c91ee5b5642fcc4969150f73d5f6848f88bf1638flim const float32_t *yi = y; 114c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 115c91ee5b5642fcc4969150f73d5f6848f88bf1638flim celt_assert(len>0); 116c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 117c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[0] = vld1q_f32(yi); 118c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vdupq_n_f32(0); 119c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 120c91ee5b5642fcc4969150f73d5f6848f88bf1638flim /* Consume 8 elements in x vector and 12 elements in y 121c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * vector. However, the 12'th element never really gets 122c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * touched in this loop. So, if len == 8, then we only 123c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * must access y[0] to y[10]. 
y[11] must not be accessed 124c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * hence make sure len > 8 and not len >= 8 125c91ee5b5642fcc4969150f73d5f6848f88bf1638flim */ 126c91ee5b5642fcc4969150f73d5f6848f88bf1638flim while (len > 8) { 127c91ee5b5642fcc4969150f73d5f6848f88bf1638flim yi += 4; 128c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[1] = vld1q_f32(yi); 129c91ee5b5642fcc4969150f73d5f6848f88bf1638flim yi += 4; 130c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[2] = vld1q_f32(yi); 131c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 132c91ee5b5642fcc4969150f73d5f6848f88bf1638flim XX[0] = vld1q_f32(xi); 133c91ee5b5642fcc4969150f73d5f6848f88bf1638flim xi += 4; 134c91ee5b5642fcc4969150f73d5f6848f88bf1638flim XX[1] = vld1q_f32(xi); 135c91ee5b5642fcc4969150f73d5f6848f88bf1638flim xi += 4; 136c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 137c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); 138c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[0] = vextq_f32(YY[0], YY[1], 1); 139c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); 140c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[1] = vextq_f32(YY[0], YY[1], 2); 141c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); 142c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[2] = vextq_f32(YY[0], YY[1], 3); 143c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); 144c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 145c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0); 146c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[0] = vextq_f32(YY[1], YY[2], 1); 147c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1); 148c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[1] = vextq_f32(YY[1], YY[2], 2); 
149c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0); 150c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[2] = vextq_f32(YY[1], YY[2], 3); 151c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1); 152c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 153c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[0] = YY[2]; 154c91ee5b5642fcc4969150f73d5f6848f88bf1638flim len -= 8; 155c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 156c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 157c91ee5b5642fcc4969150f73d5f6848f88bf1638flim /* Consume 4 elements in x vector and 8 elements in y 158c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * vector. However, the 8'th element in y never really gets 159c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * touched in this loop. So, if len == 4, then we only 160c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * must access y[0] to y[6]. y[7] must not be accessed 161c91ee5b5642fcc4969150f73d5f6848f88bf1638flim * hence make sure len>4 and not len>=4 162c91ee5b5642fcc4969150f73d5f6848f88bf1638flim */ 163c91ee5b5642fcc4969150f73d5f6848f88bf1638flim if (len > 4) { 164c91ee5b5642fcc4969150f73d5f6848f88bf1638flim yi += 4; 165c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[1] = vld1q_f32(yi); 166c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 167c91ee5b5642fcc4969150f73d5f6848f88bf1638flim XX[0] = vld1q_f32(xi); 168c91ee5b5642fcc4969150f73d5f6848f88bf1638flim xi += 4; 169c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 170c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0); 171c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[0] = vextq_f32(YY[0], YY[1], 1); 172c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1); 173c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[1] = vextq_f32(YY[0], YY[1], 2); 174c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = 
vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0); 175c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YEXT[2] = vextq_f32(YY[0], YY[1], 3); 176c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1); 177c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 178c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[0] = YY[1]; 179c91ee5b5642fcc4969150f73d5f6848f88bf1638flim len -= 4; 180c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 181c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 182c91ee5b5642fcc4969150f73d5f6848f88bf1638flim while (--len > 0) { 183c91ee5b5642fcc4969150f73d5f6848f88bf1638flim XX_2 = vld1_dup_f32(xi++); 184c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 185c91ee5b5642fcc4969150f73d5f6848f88bf1638flim YY[0]= vld1q_f32(++yi); 186c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 187c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 188c91ee5b5642fcc4969150f73d5f6848f88bf1638flim XX_2 = vld1_dup_f32(xi); 189c91ee5b5642fcc4969150f73d5f6848f88bf1638flim SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 190c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 191c91ee5b5642fcc4969150f73d5f6848f88bf1638flim vst1q_f32(sum, SUMM); 192c91ee5b5642fcc4969150f73d5f6848f88bf1638flim} 193c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 194c91ee5b5642fcc4969150f73d5f6848f88bf1638flimvoid celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, 1950c2090c324e4f2ba2a8621c8b083559bab74c7c5Felicia Lim opus_val32 *xcorr, int len, int max_pitch, int arch) { 196c91ee5b5642fcc4969150f73d5f6848f88bf1638flim int i; 1970c2090c324e4f2ba2a8621c8b083559bab74c7c5Felicia Lim (void)arch; 198c91ee5b5642fcc4969150f73d5f6848f88bf1638flim celt_assert(max_pitch > 0); 199c91ee5b5642fcc4969150f73d5f6848f88bf1638flim celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); 200c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 201c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (i = 0; i < (max_pitch-3); i += 4) { 
202c91ee5b5642fcc4969150f73d5f6848f88bf1638flim xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i, 203c91ee5b5642fcc4969150f73d5f6848f88bf1638flim (float32_t *)xcorr+i, len); 204c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 205c91ee5b5642fcc4969150f73d5f6848f88bf1638flim 2060c2090c324e4f2ba2a8621c8b083559bab74c7c5Felicia Lim /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */ 207c91ee5b5642fcc4969150f73d5f6848f88bf1638flim for (; i < max_pitch; i++) { 2080c2090c324e4f2ba2a8621c8b083559bab74c7c5Felicia Lim xcorr[i] = celt_inner_prod_neon(_x, _y+i, len); 209c91ee5b5642fcc4969150f73d5f6848f88bf1638flim } 210c91ee5b5642fcc4969150f73d5f6848f88bf1638flim} 211c91ee5b5642fcc4969150f73d5f6848f88bf1638flim#endif 212