/* Copyright (C) 2007-2008 Jean-Marc Valin
 * Copyright (C) 2008 Thorvald Natvig
 * Copyright (C) 2011 Jyri Sarha, Texas Instruments
 */
/**
   @file resample_neon.h
   @brief Resampler functions (NEON version)
*/
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <arm_neon.h>

#ifdef FIXED_POINT
/**
 * Saturate a 32-bit signed value to the signed 16-bit range.
 *
 * Written in plain C: no SIMD register is reserved, the function can be
 * constant-folded and inlined freely, and it builds for any target and
 * under strict ISO C modes.
 *
 * @param a  value to limit
 * @return   @p a clamped to [-32768, 32767], returned as int32_t
 */
static inline int32_t saturate_32bit_to_16bit(int32_t a)
{
    if (a > 32767)
        return 32767;
    if (a < -32768)
        return -32768;
    return a;
}
/* Route the resampler's WORD2INT conversion through the helper above. */
#undef WORD2INT
#define WORD2INT(x) (saturate_32bit_to_16bit(x))

#define OVERRIDE_INNER_PRODUCT_SINGLE
/**
 * Dot product of two int16_t vectors, scaled down by 2^15 with rounding
 * and saturated to the signed 16-bit range.
 *
 * When NEON is available the bulk of the work is done four elements at a
 * time (sixteen per iteration of the main loop) with widening
 * multiply-accumulates into four 32-bit lanes; as with the underlying
 * instruction, those lane sums wrap rather than saturate.  Whatever is
 * left over (fewer than four elements, or everything when NEON is not
 * enabled) is accumulated by the scalar loop, so any length, including
 * zero, is handled without reading past a[len-1] / b[len-1].
 *
 * @param a    first operand, at least @p len elements
 * @param b    second operand, at least @p len elements
 * @param len  number of elements to multiply and accumulate
 * @return     (sum(a[i] * b[i]) + 2^14) >> 15, clamped to [-32768, 32767]
 */
static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len)
{
    int64_t total = 0;
    unsigned int i = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    {
        int32x4_t lanes = vdupq_n_s32(0);
        int64x2_t wide;

        /* Main loop: 16 products per iteration, lane j collects the
           products whose index is congruent to j modulo 4. */
        for (; len - i >= 16; i += 16) {
            const int16x8_t a_lo = vld1q_s16(a + i);
            const int16x8_t a_hi = vld1q_s16(a + i + 8);
            const int16x8_t b_lo = vld1q_s16(b + i);
            const int16x8_t b_hi = vld1q_s16(b + i + 8);

            lanes = vmlal_s16(lanes, vget_low_s16(a_lo),  vget_low_s16(b_lo));
            lanes = vmlal_s16(lanes, vget_high_s16(a_lo), vget_high_s16(b_lo));
            lanes = vmlal_s16(lanes, vget_low_s16(a_hi),  vget_low_s16(b_hi));
            lanes = vmlal_s16(lanes, vget_high_s16(a_hi), vget_high_s16(b_hi));
        }

        /* Remaining groups of four. */
        for (; len - i >= 4; i += 4)
            lanes = vmlal_s16(lanes, vld1_s16(a + i), vld1_s16(b + i));

        /* Widen to 64 bits before the horizontal add so that combining
           the four lane sums cannot overflow. */
        wide = vaddl_s32(vget_low_s32(lanes), vget_high_s32(lanes));
        total = vgetq_lane_s64(wide, 0) + vgetq_lane_s64(wide, 1);
    }
#endif

    /* Scalar tail (and the whole computation when NEON is not enabled). */
    for (; i < len; i++)
        total += (int32_t)a[i] * b[i];

    /* Rounding shift by 15; relies on >> being an arithmetic shift for
       negative values, as GCC and Clang guarantee. */
    total = (total + 16384) >> 15;

    if (total > 32767)
        return 32767;
    if (total < -32768)
        return -32768;
    return (int32_t)total;
}

#elif defined(FLOATING_POINT)

/**
 * Convert a float sample to an integer in the signed 16-bit range.
 *
 * The value is first converted to Q15 fixed point with truncation toward
 * zero, then shifted back down by 15 bits with rounding (half rounds
 * toward +infinity), and finally saturated.  NaN yields 0; values at or
 * beyond the representable range (including infinities) saturate.
 *
 * @param a  sample to convert
 * @return   rounded value clamped to [-32768, 32767], as int32_t
 */
static inline int32_t saturate_float_to_16bit(float a)
{
    int32_t q15;

    if (a != a)                 /* NaN compares unequal to itself */
        return 0;
    /* Anything outside this open interval rounds to a saturated result;
       rejecting it here also keeps the Q15 conversion below in range. */
    if (a >= 32767.5f)
        return 32767;
    if (a <= -32768.5f)
        return -32768;

    /* Scaling by a power of two is exact; the cast truncates toward zero. */
    q15 = (int32_t)(a * 32768.0f);

    /* Rounding shift; relies on >> being an arithmetic shift for negative
       values, as GCC and Clang guarantee. */
    return (q15 + 16384) >> 15;
}
/* Route the resampler's WORD2INT conversion through the helper above. */
#undef WORD2INT
#define WORD2INT(x) (saturate_float_to_16bit(x))

#define OVERRIDE_INNER_PRODUCT_SINGLE
/**
 * Dot product of two float vectors.
 *
 * When NEON is available, blocks of sixteen elements are accumulated in
 * four independent 4-lane accumulators, which are then folded into one;
 * remaining groups of four are added to that accumulator before the
 * horizontal sum.  Whatever is left over (fewer than four elements, or
 * everything when NEON is not enabled) is accumulated by the scalar loop,
 * so any length, including zero, is handled without reading past
 * a[len-1] / b[len-1].
 *
 * @param a    first operand, at least @p len elements
 * @param b    second operand, at least @p len elements
 * @param len  number of elements to multiply and accumulate
 * @return     sum of a[i] * b[i] for i in [0, len)
 */
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
    float sum = 0.0f;
    unsigned int i = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    {
        float32x4_t acc0 = vdupq_n_f32(0.0f);
        float32x4_t acc1 = acc0;
        float32x4_t acc2 = acc0;
        float32x4_t acc3 = acc0;
        float32x2_t half;

        /* Main loop: four independent accumulators, 16 products per pass. */
        for (; len - i >= 16; i += 16) {
            acc0 = vmlaq_f32(acc0, vld1q_f32(a + i),      vld1q_f32(b + i));
            acc1 = vmlaq_f32(acc1, vld1q_f32(a + i + 4),  vld1q_f32(b + i + 4));
            acc2 = vmlaq_f32(acc2, vld1q_f32(a + i + 8),  vld1q_f32(b + i + 8));
            acc3 = vmlaq_f32(acc3, vld1q_f32(a + i + 12), vld1q_f32(b + i + 12));
        }

        /* Fold the four accumulators pairwise into acc0. */
        acc0 = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));

        /* Remaining groups of four. */
        for (; len - i >= 4; i += 4)
            acc0 = vmlaq_f32(acc0, vld1q_f32(a + i), vld1q_f32(b + i));

        /* Horizontal sum of the four lanes. */
        half = vadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
        half = vpadd_f32(half, half);
        sum = vget_lane_f32(half, 0);
    }
#endif

    /* Scalar tail (and the whole computation when NEON is not enabled). */
    for (; i < len; i++)
        sum += a[i] * b[i];

    return sum;
}

#endif