/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2008-2010  Nokia Corporation
 *  Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */

#include <stdint.h>
#include <limits.h>
#include "sbc.h"
#include "sbc_math.h"
#include "sbc_tables.h"

#include "sbc_primitives_neon.h"

/*
 * ARM NEON optimizations
 */

#ifdef SBC_BUILD_WITH_NEON_SUPPORT

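/*
 * Analysis filter step for the 4-subband case: accumulates the windowing
 * products (five groups of eight 16-bit multiply-accumulates from 'in' and
 * 'consts'), rounds and narrows the partial sums, then applies the 4-point
 * matrixing constants and stores four 32-bit subband samples to 'out'.
 */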
static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q1, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q0, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q1, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q0, d4, d8\n"
		"vmlal.s16  q1, d5, d9\n"

		"vpadd.s32  d0, d0, d1\n"
		"vpadd.s32  d1, d2, d3\n"

		"vrshrn.s32 d0, q0, %3\n"

		"vld1.16    {d2, d3, d4, d5}, [%1, :128]!\n"

		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		"vmull.s16  q3, d2, d0\n"
		"vmull.s16  q4, d3, d0\n"
		"vmlal.s16  q3, d4, d1\n"
		"vmlal.s16  q4, d5, d1\n"

		"vpadd.s32  d0, d6, d7\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d8, d9\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11");
}

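/*
 * Analysis filter step for the 8-subband case: same structure as the
 * 4-subband version, but with ten groups of windowing multiply-accumulates
 * and an 8-point matrixing stage, producing eight 32-bit subband samples.
 */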
static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* TODO: merge even and odd cases (or even merge all four calls to this
	 * function) in order to have only aligned reads from 'in' array
	 * and reduce number of load instructions */
	asm volatile (
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmull.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmull.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmull.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmull.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"
		"vmlal.s16  q8, d6, d10\n"
		"vld1.16    {d4, d5}, [%0, :64]!\n"
		"vmlal.s16  q9, d7, d11\n"
		"vld1.16    {d8, d9}, [%1, :128]!\n"

		"vmlal.s16  q6, d4, d8\n"
		"vld1.16    {d6,  d7}, [%0, :64]!\n"
		"vmlal.s16  q7, d5, d9\n"
		"vld1.16    {d10, d11}, [%1, :128]!\n"

		"vmlal.s16  q8, d6, d10\n"
		"vmlal.s16  q9, d7, d11\n"

		"vpadd.s32  d0, d12, d13\n"
		"vpadd.s32  d1, d14, d15\n"
		"vpadd.s32  d2, d16, d17\n"
		"vpadd.s32  d3, d18, d19\n"

		"vrshr.s32 q0, q0, %3\n"
		"vrshr.s32 q1, q1, %3\n"
		"vmovn.s32 d0, q0\n"
		"vmovn.s32 d1, q1\n"

		"vdup.i32   d3, d1[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d2, d1[0]\n"  /* TODO: can be eliminated */
		"vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
		"vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmull.s16  q6, d4, d0\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmull.s16  q7, d5, d0\n"
		"vmull.s16  q8, d6, d0\n"
		"vmull.s16  q9, d7, d0\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d1\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d1\n"
		"vmlal.s16  q8, d6, d1\n"
		"vmlal.s16  q9, d7, d1\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d2\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d2\n"
		"vmlal.s16  q8, d6, d2\n"
		"vmlal.s16  q9, d7, d2\n"

		"vld1.16    {d4, d5}, [%1, :128]!\n"
		"vmlal.s16  q6, d4, d3\n"
		"vld1.16    {d6, d7}, [%1, :128]!\n"
		"vmlal.s16  q7, d5, d3\n"
		"vmlal.s16  q8, d6, d3\n"
		"vmlal.s16  q9, d7, d3\n"

		"vpadd.s32  d0, d12, d13\n" /* TODO: can be eliminated */
		"vpadd.s32  d1, d14, d15\n" /* TODO: can be eliminated */
		"vpadd.s32  d2, d16, d17\n" /* TODO: can be eliminated */
		"vpadd.s32  d3, d18, d19\n" /* TODO: can be eliminated */

		"vst1.32    {d0, d1, d2, d3}, [%2, :128]\n"
		: "+r" (in), "+r" (consts)
		: "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "memory",
			"d0", "d1", "d2", "d3", "d4", "d5",
			"d6", "d7", "d8", "d9", "d10", "d11",
			"d12", "d13", "d14", "d15", "d16", "d17",
			"d18", "d19");
}

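/*
 * Analyze four blocks per call: each inner call processes one block at a
 * decreasing offset into the X buffer, alternating the "odd"/"even"
 * constant tables to account for the varying alignment of the input window.
 */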
static inline void sbc_analyze_4b_4s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
	out += out_stride;
	_sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
	out += out_stride;
	_sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
}

static inline void sbc_analyze_4b_8s_neon(int16_t *x,
						int32_t *out, int out_stride)
{
	/* Analyze blocks */
	_sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
	out += out_stride;
	_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
}

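/*
 * Scale factor calculation: for each channel and each group of four
 * subbands, take the maximum absolute subband sample over all blocks
 * (four blocks per loop iteration) and derive the scale factor from its
 * most significant bit position via CLZ.
 */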
static void sbc_calc_scalefactors_neon(
	int32_t sb_sample_f[16][2][8],
	uint32_t scale_factor[2][8],
	int blocks, int channels, int subbands)
{
	int ch, sb;
	for (ch = 0; ch < channels; ch++) {
		for (sb = 0; sb < subbands; sb += 4) {
			int blk = blocks;
			int32_t *in = &sb_sample_f[0][ch][sb];
			asm volatile (
				"vmov.s32  q0, #0\n"
				"vmov.s32  q1, %[c1]\n"
				"vmov.s32  q14, #1\n"
				"vmov.s32  q15, %[c2]\n"
				"vadd.s32  q1, q1, q14\n"
			"1:\n"
				"vld1.32   {d16, d17}, [%[in], :128], %[inc]\n"
				"vabs.s32  q8,  q8\n"
				"vld1.32   {d18, d19}, [%[in], :128], %[inc]\n"
				"vabs.s32  q9,  q9\n"
				"vld1.32   {d20, d21}, [%[in], :128], %[inc]\n"
				"vabs.s32  q10, q10\n"
				"vld1.32   {d22, d23}, [%[in], :128], %[inc]\n"
				"vabs.s32  q11, q11\n"
				"vmax.s32  q0,  q0,  q8\n"
				"vmax.s32  q1,  q1,  q9\n"
				"vmax.s32  q0,  q0,  q10\n"
				"vmax.s32  q1,  q1,  q11\n"
				"subs      %[blk], %[blk], #4\n"
				"bgt       1b\n"
				"vmax.s32  q0,  q0,  q1\n"
				"vsub.s32  q0,  q0,  q14\n"
				"vclz.s32  q0,  q0\n"
				"vsub.s32  q0,  q15, q0\n"
				"vst1.32   {d0, d1}, [%[out], :128]\n"
			:
			  [blk]    "+r" (blk),
			  [in]     "+r" (in)
			:
			  [inc]     "r" ((char *) &sb_sample_f[1][0][0] -
					 (char *) &sb_sample_f[0][0][0]),
			  [out]     "r" (&scale_factor[ch][sb]),
			  [c1]      "i" (1 << SCALE_OUT_BITS),
			  [c2]      "i" (31 - SCALE_OUT_BITS)
			: "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23", "d24", "d25", "d26",
			  "d27", "d28", "d29", "d30", "d31", "cc", "memory");
		}
	}
}

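/*
 * Joint stereo variant of the scale factor calculation: scale factors are
 * computed both for the plain left/right samples and for their half-sum/
 * half-difference combination. Joint coding is selected per subband where
 * it gives smaller scale factors (the last subband is never joint coded),
 * the affected samples are rewritten in place, and the per-subband joint
 * flags are returned as a bitmask.
 */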
int sbc_calc_scalefactors_j_neon(
	int32_t sb_sample_f[16][2][8],
	uint32_t scale_factor[2][8],
	int blocks, int subbands)
{
	static SBC_ALIGNED int32_t joint_bits_mask[8] = {
		8,   4,  2,  1, 128, 64, 32, 16
	};
	int joint, i;
	int32_t  *in0, *in1;
	int32_t  *in = &sb_sample_f[0][0][0];
	uint32_t *out0, *out1;
	uint32_t *out = &scale_factor[0][0];
	int32_t  *consts = joint_bits_mask;

	i = subbands;

	asm volatile (
		/*
		 * constants: q13 = (31 - SCALE_OUT_BITS), q14 = 1
		 * input:     q0  = ((1 << SCALE_OUT_BITS) + 1)
		 *            %[in0] - samples for channel 0
		 *            %[in1] - samples for channel 1
		 * output:    q0, q1 - scale factors without joint stereo
		 *            q2, q3 - scale factors with joint stereo
		 *            q15    - joint stereo selection mask
		 */
		".macro calc_scalefactors\n"
			"vmov.s32  q1, q0\n"
			"vmov.s32  q2, q0\n"
			"vmov.s32  q3, q0\n"
			"mov       %[i], %[blocks]\n"
		"1:\n"
			"vld1.32   {d18, d19}, [%[in1], :128], %[inc]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d16, d17}, [%[in0], :128], %[inc]\n"
			"vhadd.s32 q10, q8,  q11\n"
			"vhsub.s32 q11, q8,  q11\n"
			"vabs.s32  q8,  q8\n"
			"vabs.s32  q9,  q9\n"
			"vabs.s32  q10, q10\n"
			"vabs.s32  q11, q11\n"
			"vmax.s32  q0,  q0,  q8\n"
			"vmax.s32  q1,  q1,  q9\n"
			"vmax.s32  q2,  q2,  q10\n"
			"vmax.s32  q3,  q3,  q11\n"
			"subs      %[i], %[i], #1\n"
			"bgt       1b\n"
			"vsub.s32  q0,  q0,  q14\n"
			"vsub.s32  q1,  q1,  q14\n"
			"vsub.s32  q2,  q2,  q14\n"
			"vsub.s32  q3,  q3,  q14\n"
			"vclz.s32  q0,  q0\n"
			"vclz.s32  q1,  q1\n"
			"vclz.s32  q2,  q2\n"
			"vclz.s32  q3,  q3\n"
			"vsub.s32  q0,  q13, q0\n"
			"vsub.s32  q1,  q13, q1\n"
			"vsub.s32  q2,  q13, q2\n"
			"vsub.s32  q3,  q13, q3\n"
		".endm\n"
		/*
		 * constants: q14 = 1
		 * input: q15    - joint stereo selection mask
		 *        %[in0] - value set by calc_scalefactors macro
		 *        %[in1] - value set by calc_scalefactors macro
		 */
		".macro update_joint_stereo_samples\n"
			"sub       %[out1], %[in1], %[inc]\n"
			"sub       %[out0], %[in0], %[inc]\n"
			"sub       %[in1], %[in1], %[inc], asl #1\n"
			"sub       %[in0], %[in0], %[inc], asl #1\n"
			"vld1.32   {d18, d19}, [%[in1], :128]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d16, d17}, [%[in0], :128]\n"
			"vld1.32   {d2, d3}, [%[out1], :128]\n"
			"vbic.s32  q3,  q1,  q14\n"
			"vld1.32   {d0, d1}, [%[out0], :128]\n"
			"vhsub.s32 q10, q8,  q11\n"
			"vhadd.s32 q11, q8,  q11\n"
			"vhsub.s32 q2,  q0,  q3\n"
			"vhadd.s32 q3,  q0,  q3\n"
			"vbif.s32  q10, q9,  q15\n"
			"vbif.s32  d22, d16, d30\n"
			"sub       %[inc], %[zero], %[inc], asl #1\n"
			"sub       %[i], %[blocks], #2\n"
		"2:\n"
			"vbif.s32  d23, d17, d31\n"
			"vst1.32   {d20, d21}, [%[in1], :128], %[inc]\n"
			"vbif.s32  d4,  d2,  d30\n"
			"vld1.32   {d18, d19}, [%[in1], :128]\n"
			"vbif.s32  d5,  d3,  d31\n"
			"vst1.32   {d22, d23}, [%[in0], :128], %[inc]\n"
			"vbif.s32  d6,  d0,  d30\n"
			"vld1.32   {d16, d17}, [%[in0], :128]\n"
			"vbif.s32  d7,  d1,  d31\n"
			"vst1.32   {d4, d5}, [%[out1], :128], %[inc]\n"
			"vbic.s32  q11, q9,  q14\n"
			"vld1.32   {d2, d3}, [%[out1], :128]\n"
			"vst1.32   {d6, d7}, [%[out0], :128], %[inc]\n"
			"vbic.s32  q3,  q1,  q14\n"
			"vld1.32   {d0, d1}, [%[out0], :128]\n"
			"vhsub.s32 q10, q8,  q11\n"
			"vhadd.s32 q11, q8,  q11\n"
			"vhsub.s32 q2,  q0,  q3\n"
			"vhadd.s32 q3,  q0,  q3\n"
			"vbif.s32  q10, q9,  q15\n"
			"vbif.s32  d22, d16, d30\n"
			"subs      %[i], %[i], #2\n"
			"bgt       2b\n"
			"sub       %[inc], %[zero], %[inc], asr #1\n"
			"vbif.s32  d23, d17, d31\n"
			"vst1.32   {d20, d21}, [%[in1], :128]\n"
			"vbif.s32  q2,  q1,  q15\n"
			"vst1.32   {d22, d23}, [%[in0], :128]\n"
			"vbif.s32  q3,  q0,  q15\n"
			"vst1.32   {d4, d5}, [%[out1], :128]\n"
			"vst1.32   {d6, d7}, [%[out0], :128]\n"
		".endm\n"

		"vmov.s32  q14, #1\n"
		"vmov.s32  q13, %[c2]\n"

		"cmp   %[i], #4\n"
		"bne   8f\n"

	"4:\n" /* 4 subbands */
		"add   %[in0], %[in], #0\n"
		"add   %[in1], %[in], #32\n"
		"add   %[out0], %[out], #0\n"
		"add   %[out1], %[out], #32\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 0, 1, 2 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vmov.s32  d31[1], %[zero]\n" /* last subband -> no joint */
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* calculate and save to memory 'joint' variable */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"  vpadd.s32 d16, d16, d16\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vst1.32   {d16[0]}, [%[joint]]\n"

		"update_joint_stereo_samples\n"
		"b     9f\n"

	"8:\n" /* 8 subbands */
		"add   %[in0], %[in], #16\n"
		"add   %[in1], %[in], #48\n"
		"add   %[out0], %[out], #16\n"
		"add   %[out1], %[out], #48\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 4, 5, 6 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vmov.s32  d31[1], %[zero]\n"  /* last subband -> no joint */
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* calculate part of 'joint' variable and save it to d24 */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vpadd.s32 d24, d16, d16\n"

		"update_joint_stereo_samples\n"

		"add   %[in0], %[in], #0\n"
		"add   %[in1], %[in], #32\n"
		"add   %[out0], %[out], #0\n"
		"add   %[out1], %[out], #32\n"
		"vmov.s32  q0, %[c1]\n"
		"vadd.s32  q0, q0, q14\n"

		"calc_scalefactors\n"

		/* check whether to use joint stereo for subbands 0, 1, 2, 3 */
		"vadd.s32  q15, q0,  q1\n"
		"vadd.s32  q9,  q2,  q3\n"
		"vld1.32   {d16, d17}, [%[consts], :128]!\n"
		"vcgt.s32  q15, q15, q9\n"

		/* combine last part of 'joint' with d24 and save to memory */
		/* update and save scale factors to memory */
		"  vand.s32  q8, q8, q15\n"
		"vbit.s32  q0,  q2,  q15\n"
		"  vpadd.s32 d16, d16, d17\n"
		"vbit.s32  q1,  q3,  q15\n"
		"  vpadd.s32 d16, d16, d16\n"
		"vst1.32   {d0, d1}, [%[out0], :128]\n"
		"  vadd.s32  d16, d16, d24\n"
		"vst1.32   {d2, d3}, [%[out1], :128]\n"
		"  vst1.32   {d16[0]}, [%[joint]]\n"

		"update_joint_stereo_samples\n"
	"9:\n"
		".purgem calc_scalefactors\n"
		".purgem update_joint_stereo_samples\n"
		:
		  [i]      "+&r" (i),
		  [in]     "+&r" (in),
		  [in0]    "=&r" (in0),
		  [in1]    "=&r" (in1),
		  [out]    "+&r" (out),
		  [out0]   "=&r" (out0),
		  [out1]   "=&r" (out1),
		  [consts] "+&r" (consts)
		:
		  [inc]      "r" ((char *) &sb_sample_f[1][0][0] -
				 (char *) &sb_sample_f[0][0][0]),
		  [blocks]   "r" (blocks),
		  [joint]    "r" (&joint),
		  [c1]       "i" (1 << SCALE_OUT_BITS),
		  [c2]       "i" (31 - SCALE_OUT_BITS),
		  [zero]     "r" (0)
		: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
		  "d16", "d17", "d18", "d19", "d20", "d21", "d22",
		  "d23", "d24", "d25", "d26", "d27", "d28", "d29",
		  "d30", "d31", "cc", "memory");

	return joint;
}

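/*
 * Byte shuffle tables for vtbl.8: PERM_BE swaps the two bytes of every
 * 16-bit sample (big endian PCM input), PERM_LE keeps them as-is, and both
 * reorder the samples into the layout expected by the analysis window.
 */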
#define PERM_BE(a, b, c, d) {             \
		(a * 2) + 1, (a * 2) + 0, \
		(b * 2) + 1, (b * 2) + 0, \
		(c * 2) + 1, (c * 2) + 0, \
		(d * 2) + 1, (d * 2) + 0  \
	}
#define PERM_LE(a, b, c, d) {             \
		(a * 2) + 0, (a * 2) + 1, \
		(b * 2) + 0, (b * 2) + 1, \
		(c * 2) + 0, (c * 2) + 1, \
		(d * 2) + 0, (d * 2) + 1  \
	}

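/*
 * Deinterleave and byte swap incoming PCM into the X buffer for the
 * 4-subband case: handles ring buffer wraparound (copying 40 samples of
 * history to the top of the buffer), unaligned 'pcm' input and mono/stereo,
 * processing eight samples per channel per loop iteration while working
 * backwards from 'position'.
 */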
static SBC_ALWAYS_INLINE int sbc_enc_process_input_4s_neon_internal(
	int position,
	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
	int nsamples, int nchannels, int big_endian)
{
	static SBC_ALIGNED uint8_t perm_be[2][8] = {
		PERM_BE(7, 3, 6, 4),
		PERM_BE(0, 2, 1, 5)
	};
	static SBC_ALIGNED uint8_t perm_le[2][8] = {
		PERM_LE(7, 3, 6, 4),
		PERM_LE(0, 2, 1, 5)
	};
	/* handle X buffer wraparound */
	if (position < nsamples) {
		int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 40];
		int16_t *src = &X[0][position];
		asm volatile (
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0}, [%[src], :64]!\n"
			"vst1.16 {d0}, [%[dst], :64]!\n"
			:
			  [dst] "+r" (dst),
			  [src] "+r" (src)
			: : "memory", "d0", "d1", "d2", "d3");
		if (nchannels > 1) {
			dst = &X[1][SBC_X_BUFFER_SIZE - 40];
			src = &X[1][position];
			asm volatile (
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0}, [%[src], :64]!\n"
				"vst1.16 {d0}, [%[dst], :64]!\n"
				:
				  [dst] "+r" (dst),
				  [src] "+r" (src)
				: : "memory", "d0", "d1", "d2", "d3");
		}
		position = SBC_X_BUFFER_SIZE - 40;
	}

	if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
		/* poor 'pcm' alignment */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #16\n"
			"sub     %[y], %[y], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld1.8  {d4, d5}, [%[pcm]]!\n"
			"vuzp.16 d4,  d5\n"
			"vld1.8  {d20, d21}, [%[pcm]]!\n"
			"vuzp.16 d20, d21\n"
			"vswp    d5,  d20\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vtbl.8  d18, {d20, d21}, d0\n"
			"vtbl.8  d19, {d20, d21}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"vst1.16 {d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else if (nchannels > 1) {
		/* proper 'pcm' alignment */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #16\n"
			"sub     %[y], %[y], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld2.16 {d4, d5}, [%[pcm]]!\n"
			"vld2.16 {d20, d21}, [%[pcm]]!\n"
			"vswp    d5, d20\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vtbl.8  d18, {d20, d21}, d0\n"
			"vtbl.8  d19, {d20, d21}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"vst1.16 {d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else {
		int16_t *x = &X[0][position];
		asm volatile (
			"vld1.8  {d0, d1}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #16\n"
			"sub     %[position], %[position], #8\n"
			"vld1.8  {d4, d5}, [%[pcm]]!\n"
			"vtbl.8  d16, {d4, d5}, d0\n"
			"vtbl.8  d17, {d4, d5}, d1\n"
			"vst1.16 {d16, d17}, [%[x], :128]\n"
			"subs    %[nsamples], %[nsamples], #8\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19");
	}
	return position;
}

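/*
 * Same as above for the 8-subband case: 72 samples of history are copied on
 * wraparound and sixteen samples per channel are processed per loop
 * iteration.
 */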
static SBC_ALWAYS_INLINE int sbc_enc_process_input_8s_neon_internal(
	int position,
	const uint8_t *pcm, int16_t X[2][SBC_X_BUFFER_SIZE],
	int nsamples, int nchannels, int big_endian)
{
	static SBC_ALIGNED uint8_t perm_be[4][8] = {
		PERM_BE(15, 7, 14, 8),
		PERM_BE(13, 9, 12, 10),
		PERM_BE(11, 3, 6,  0),
		PERM_BE(5,  1, 4,  2)
	};
	static SBC_ALIGNED uint8_t perm_le[4][8] = {
		PERM_LE(15, 7, 14, 8),
		PERM_LE(13, 9, 12, 10),
		PERM_LE(11, 3, 6,  0),
		PERM_LE(5,  1, 4,  2)
	};
	/* handle X buffer wraparound */
	if (position < nsamples) {
		int16_t *dst = &X[0][SBC_X_BUFFER_SIZE - 72];
		int16_t *src = &X[0][position];
		asm volatile (
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
			"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
			"vld1.16 {d0, d1}, [%[src], :128]!\n"
			"vst1.16 {d0, d1}, [%[dst], :128]!\n"
			:
			  [dst] "+r" (dst),
			  [src] "+r" (src)
			: : "memory", "d0", "d1", "d2", "d3");
		if (nchannels > 1) {
			dst = &X[1][SBC_X_BUFFER_SIZE - 72];
			src = &X[1][position];
			asm volatile (
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1, d2, d3}, [%[src], :128]!\n"
				"vst1.16 {d0, d1, d2, d3}, [%[dst], :128]!\n"
				"vld1.16 {d0, d1}, [%[src], :128]!\n"
				"vst1.16 {d0, d1}, [%[dst], :128]!\n"
				:
				  [dst] "+r" (dst),
				  [src] "+r" (src)
				: : "memory", "d0", "d1", "d2", "d3");
		}
		position = SBC_X_BUFFER_SIZE - 72;
	}

	if ((nchannels > 1) && ((uintptr_t)pcm & 1)) {
		/* poor 'pcm' alignment */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #32\n"
			"sub     %[y], %[y], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld1.8  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vuzp.16 q2,  q3\n"
			"vld1.8  {d20, d21, d22, d23}, [%[pcm]]!\n"
			"vuzp.16 q10, q11\n"
			"vswp    q3,  q10\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"vtbl.8  d16, {d20, d21, d22, d23}, d0\n"
			"vtbl.8  d17, {d20, d21, d22, d23}, d1\n"
			"vtbl.8  d18, {d20, d21, d22, d23}, d2\n"
			"vtbl.8  d19, {d20, d21, d22, d23}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else if (nchannels > 1) {
		/* proper 'pcm' alignment */
		int16_t *x = &X[0][position];
		int16_t *y = &X[1][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #32\n"
			"sub     %[y], %[y], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld2.16  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vld2.16  {d20, d21, d22, d23}, [%[pcm]]!\n"
			"vswp    q3, q10\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"vtbl.8  d16, {d20, d21, d22, d23}, d0\n"
			"vtbl.8  d17, {d20, d21, d22, d23}, d1\n"
			"vtbl.8  d18, {d20, d21, d22, d23}, d2\n"
			"vtbl.8  d19, {d20, d21, d22, d23}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[y], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [y]        "+r" (y),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19",
			  "d20", "d21", "d22", "d23");
	} else {
		int16_t *x = &X[0][position];
		asm volatile (
			"vld1.8  {d0, d1, d2, d3}, [%[perm], :128]\n"
		"1:\n"
			"sub     %[x], %[x], #32\n"
			"sub     %[position], %[position], #16\n"
			"vld1.8  {d4, d5, d6, d7}, [%[pcm]]!\n"
			"vtbl.8  d16, {d4, d5, d6, d7}, d0\n"
			"vtbl.8  d17, {d4, d5, d6, d7}, d1\n"
			"vtbl.8  d18, {d4, d5, d6, d7}, d2\n"
			"vtbl.8  d19, {d4, d5, d6, d7}, d3\n"
			"vst1.16 {d16, d17, d18, d19}, [%[x], :128]\n"
			"subs    %[nsamples], %[nsamples], #16\n"
			"bgt     1b\n"
			:
			  [x]        "+r" (x),
			  [pcm]      "+r" (pcm),
			  [nsamples] "+r" (nsamples),
			  [position] "+r" (position)
			:
			  [perm]      "r" (big_endian ? perm_be : perm_le)
			: "cc", "memory", "d0", "d1", "d2", "d3", "d4",
			  "d5", "d6", "d7", "d16", "d17", "d18", "d19");
	}
	return position;
}

#undef PERM_BE
#undef PERM_LE

static int sbc_enc_process_input_4s_be_neon(int position, const uint8_t *pcm,
					int16_t X[2][SBC_X_BUFFER_SIZE],
					int nsamples, int nchannels)
{
	return sbc_enc_process_input_4s_neon_internal(
		position, pcm, X, nsamples, nchannels, 1);
}

static int sbc_enc_process_input_4s_le_neon(int position, const uint8_t *pcm,
					int16_t X[2][SBC_X_BUFFER_SIZE],
					int nsamples, int nchannels)
{
	return sbc_enc_process_input_4s_neon_internal(
		position, pcm, X, nsamples, nchannels, 0);
}

static int sbc_enc_process_input_8s_be_neon(int position, const uint8_t *pcm,
					int16_t X[2][SBC_X_BUFFER_SIZE],
					int nsamples, int nchannels)
{
	return sbc_enc_process_input_8s_neon_internal(
		position, pcm, X, nsamples, nchannels, 1);
}

static int sbc_enc_process_input_8s_le_neon(int position, const uint8_t *pcm,
					int16_t X[2][SBC_X_BUFFER_SIZE],
					int nsamples, int nchannels)
{
	return sbc_enc_process_input_8s_neon_internal(
		position, pcm, X, nsamples, nchannels, 0);
}

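/* Hook the NEON implementations into the encoder's primitives table. */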
void sbc_init_primitives_neon(struct sbc_encoder_state *state)
{
	state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
	state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
	state->sbc_calc_scalefactors = sbc_calc_scalefactors_neon;
	state->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j_neon;
	state->sbc_enc_process_input_4s_le = sbc_enc_process_input_4s_le_neon;
	state->sbc_enc_process_input_4s_be = sbc_enc_process_input_4s_be_neon;
	state->sbc_enc_process_input_8s_le = sbc_enc_process_input_8s_le_neon;
	state->sbc_enc_process_input_8s_be = sbc_enc_process_input_8s_be_neon;
	state->implementation_info = "NEON";
}

#endif