/* Copyright 2013 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
5
6#include "dsp_util.h"
7
#ifndef max
/* Yields the larger of the two arguments, evaluating each exactly once.
 * When the comparison is false (equal or unordered) the second argument is
 * the result.
 */
#define max(a, b)						\
	({							\
		__typeof__(a) max_lhs_ = (a);			\
		__typeof__(b) max_rhs_ = (b);			\
		max_lhs_ > max_rhs_ ? max_lhs_ : max_rhs_;	\
	})
#endif
13
#ifndef min
/* Yields the smaller of the two arguments, evaluating each exactly once.
 * When the comparison is false (equal or unordered) the second argument is
 * the result.
 */
#define min(a, b)						\
	({							\
		__typeof__(a) min_lhs_ = (a);			\
		__typeof__(b) min_rhs_ = (b);			\
		min_lhs_ < min_rhs_ ? min_lhs_ : min_rhs_;	\
	})
#endif
19
20#undef deinterleave_stereo
21#undef interleave_stereo
22
/* Converts shorts in range of -32768 to 32767 to floats in range of
 * -1.0f to 1.0f.
 * scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
 * shorts to int with sign extension.
 */
28#ifdef __aarch64__
29static void deinterleave_stereo(int16_t *input, float *output1,
30				float *output2, int frames)
31{
32	int chunk = frames >> 3;
33	frames &= 7;
34	/* Process 8 frames (16 samples) each loop. */
35	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
36	if (chunk) {
37		__asm__ __volatile__ (
38			"1:                                         \n"
39			"ld2  {v2.8h, v3.8h}, [%[input]], #32       \n"
40			"subs %w[chunk], %w[chunk], #1              \n"
41			"sxtl   v0.4s, v2.4h                        \n"
42			"sxtl2  v1.4s, v2.8h                        \n"
43			"sxtl   v2.4s, v3.4h                        \n"
44			"sxtl2  v3.4s, v3.8h                        \n"
45			"scvtf  v0.4s, v0.4s, #15                   \n"
46			"scvtf  v1.4s, v1.4s, #15                   \n"
47			"scvtf  v2.4s, v2.4s, #15                   \n"
48			"scvtf  v3.4s, v3.4s, #15                   \n"
49			"st1    {v0.4s, v1.4s}, [%[output1]], #32   \n"
50			"st1    {v2.4s, v3.4s}, [%[output2]], #32   \n"
51			"b.ne   1b                                  \n"
52			: /* output */
53			  [chunk]"+r"(chunk),
54			  [input]"+r"(input),
55			  [output1]"+r"(output1),
56			  [output2]"+r"(output2)
57			: /* input */
58			: /* clobber */
59			  "v0", "v1", "v2", "v3", "memory", "cc"
60			);
61	}
62
63	/* The remaining samples. */
64	while (frames--) {
65		*output1++ = *input++ / 32768.0f;
66		*output2++ = *input++ / 32768.0f;
67	}
68}
69#define deinterleave_stereo deinterleave_stereo
70
71/* Converts floats in range of -1.0f to 1.0f to shorts in range of
72 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
73 * from zero.
74 * Rounding is achieved by using fcvtas instruction. (a = away)
75 * The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
76 * Add to exponent is equivalent to multiply for exponent range of 0 to 239,
77 * which is 2.59 * 10^33.  A signed saturating add (sqadd) limits exponents
78 * from 240 to 255 to clamp to 255.
79 * For very large values, beyond +/- 2 billion, fcvtas will clamp the result
80 * to the min or max value that fits an int.
81 * For other values, sqxtn clamps the output to -32768 to 32767 range.
82 */
83static void interleave_stereo(float *input1, float *input2,
84			      int16_t *output, int frames)
85{
86	/* Process 4 frames (8 samples) each loop. */
87	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
88	int chunk = frames >> 2;
89	frames &= 3;
90
91	if (chunk) {
92		__asm__ __volatile__ (
93			"dup    v2.4s, %w[scale]                    \n"
94			"1:                                         \n"
95			"ld1    {v0.4s}, [%[input1]], #16           \n"
96			"ld1    {v1.4s}, [%[input2]], #16           \n"
97			"subs   %w[chunk], %w[chunk], #1            \n"
98			"sqadd  v0.4s, v0.4s, v2.4s                 \n"
99			"sqadd  v1.4s, v1.4s, v2.4s                 \n"
100			"fcvtas v0.4s, v0.4s                        \n"
101			"fcvtas v1.4s, v1.4s                        \n"
102			"sqxtn  v0.4h, v0.4s                        \n"
103			"sqxtn  v1.4h, v1.4s                        \n"
104			"st2    {v0.4h, v1.4h}, [%[output]], #16    \n"
105			"b.ne   1b                                  \n"
106			: /* output */
107			  [chunk]"+r"(chunk),
108			  [input1]"+r"(input1),
109			  [input2]"+r"(input2),
110			  [output]"+r"(output)
111			: /* input */
112			  [scale]"r"(15 << 23)
113			: /* clobber */
114			  "v0", "v1", "v2", "memory", "cc"
115			);
116	}
117
118	/* The remaining samples */
119	while (frames--) {
120		float f;
121		f = *input1++ * 32768.0f;
122		f += (f >= 0) ? 0.5f : -0.5f;
123		*output++ = max(-32768, min(32767, (int)(f)));
124		f = *input2++ * 32768.0f;
125		f += (f >= 0) ? 0.5f : -0.5f;
126		*output++ = max(-32768, min(32767, (int)(f)));
127	}
128}
129#define interleave_stereo interleave_stereo
130#endif
131
132#ifdef __ARM_NEON__
133#include <arm_neon.h>
134
135static void deinterleave_stereo(int16_t *input, float *output1,
136				float *output2, int frames)
137{
138	/* Process 8 frames (16 samples) each loop. */
139	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
140	int chunk = frames >> 3;
141	frames &= 7;
142	if (chunk) {
143		__asm__ __volatile__ (
144			"1:					    \n"
145			"vld2.16 {d0-d3}, [%[input]]!		    \n"
146			"subs %[chunk], #1			    \n"
147			"vmovl.s16 q3, d3			    \n"
148			"vmovl.s16 q2, d2			    \n"
149			"vmovl.s16 q1, d1			    \n"
150			"vmovl.s16 q0, d0			    \n"
151			"vcvt.f32.s32 q3, q3, #15		    \n"
152			"vcvt.f32.s32 q2, q2, #15		    \n"
153			"vcvt.f32.s32 q1, q1, #15		    \n"
154			"vcvt.f32.s32 q0, q0, #15		    \n"
155			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
156			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
157			"bne 1b					    \n"
158			: /* output */
159			  [chunk]"+r"(chunk),
160			  [input]"+r"(input),
161			  [output1]"+r"(output1),
162			  [output2]"+r"(output2)
163			: /* input */
164			: /* clobber */
165			  "q0", "q1", "q2", "q3", "memory", "cc"
166			);
167	}
168
169	/* The remaining samples. */
170	while (frames--) {
171		*output1++ = *input++ / 32768.0f;
172		*output2++ = *input++ / 32768.0f;
173	}
174}
175#define deinterleave_stereo deinterleave_stereo
176
/* Scalar float -> int16 conversion used for the frames left over after the
 * vector loop: scales by 32768 and rounds to nearest with ties away from
 * zero. The clamp is done on the float so that values outside the int range
 * are never cast (that cast would be undefined); NaN becomes 0.
 */
static int16_t neon_round_to_s16(float sample)
{
	float scaled = sample * 32768.0f;

	scaled += (scaled >= 0) ? 0.5f : -0.5f;
	if (scaled >= 32767.0f)
		return 32767;
	if (scaled <= -32768.0f)
		return -32768;
	if (scaled != scaled) /* NaN */
		return 0;
	return (int16_t)(int)scaled;
}

/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
 * from zero.
 * Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
 * precision, and then converting float to fixed point using vcvt instruction
 * which truncates toward zero.
 * For very large values, beyond +/- 2 billion, vcvt will clamp the result
 * to the min or max value that fits an int.
 * For other values, vqmovn clamps the output to -32768 to 32767 range.
 *
 * Args:
 *    input1 - Samples stored first in every output frame (frames floats).
 *    input2 - Samples stored second in every output frame (frames floats).
 *    output - Receives 2 * frames interleaved shorts.
 *    frames - Number of stereo frames. Zero or negative converts nothing.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* A negative count would make chunk negative, and the asm loop below
	 * only stops when its counter reaches exactly zero.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"veor q0, q0, q0			    \n"
			"1:					    \n"
			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0			    \n"
			"vcgt.f32 q4, q2, q0			    \n"
			"vbsl q3, %q[pos], %q[neg]		    \n"
			"vbsl q4, %q[pos], %q[neg]		    \n"
			"vadd.f32 q1, q1, q3			    \n"
			"vadd.f32 q2, q2, q4			    \n"
			"vcvt.s32.f32 q1, q1, #15		    \n"
			"vcvt.s32.f32 q2, q2, #15		    \n"
			"vqmovn.s32 d2, q1			    \n"
			"vqmovn.s32 d3, q2			    \n"
			"vst2.16 {d2-d3}, [%[output]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining (at most 3) frames, one at a time. */
	while (frames--) {
		*output++ = neon_round_to_s16(*input1++);
		*output++ = neon_round_to_s16(*input2++);
	}
}
244#define interleave_stereo interleave_stereo
245#endif
246
247#ifdef __SSE3__
248#include <emmintrin.h>
249
/* Converts shorts in range of -32768 to 32767 to floats in range of
 * -1.0f to 1.0f.
 * pslld and psrad shifts are used to isolate the low and high word, but
 * each in a different range:
 * The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
 * The high word is shifted (with sign extension) to the low bits in range
 * 0xffff8000 .. 0x00007fff.
 * cvtdq2ps converts ints to floats as is.
 * mulps is used to normalize the range of the low and high words, adjusting
 * for high and low words being in different range.
 *
 * Args:
 *    input - Interleaved samples; 2 * frames values are read.
 *    output1 - Receives the first sample of every frame (frames floats).
 *    output2 - Receives the second sample of every frame (frames floats).
 *    frames - Number of stereo frames. Zero or negative converts nothing.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	int chunk;

	/* A negative count would make chunk negative, and the asm loop below
	 * only stops when its counter reaches exactly zero.
	 */
	if (frames <= 0)
		return;

	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The remaining (at most 7) frames, one at a time. */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
314#define deinterleave_stereo deinterleave_stereo
315
/* Scalar float -> int16 conversion used for the frames left over after the
 * vector loop: scales by 32768 and rounds to nearest with ties away from
 * zero. The clamp is done on the float so that values outside the int range
 * are never cast (that cast would be undefined); NaN becomes 0.
 */
static int16_t sse_round_to_s16(float sample)
{
	float scaled = sample * 32768.0f;

	scaled += (scaled >= 0) ? 0.5f : -0.5f;
	if (scaled >= 32767.0f)
		return 32767;
	if (scaled <= -32768.0f)
		return -32768;
	if (scaled != scaled) /* NaN */
		return 0;
	return (int16_t)(int)scaled;
}

/* Converts floats in range of -1.0f to 1.0f to shorts in range of
 * -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to
 * even.
 * The float is scaled by 2^15 by adding 15 to its exponent field (paddsw on
 * the high word of each float).
 * cvtps2dq produces 0x80000000 for any value that does not fit an int, which
 * packssdw then clamps to -32768. That is the right answer for very large
 * negative values; very large positive values are first limited to 32767.0
 * with minps so that they saturate to 32767 instead.
 *
 * Args:
 *    input1 - Samples stored first in every output frame (frames floats).
 *    input2 - Samples stored second in every output frame (frames floats).
 *    output - Receives 2 * frames interleaved shorts.
 *    frames - Number of stereo frames. Zero or negative converts nothing.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	int chunk;

	/* A negative count would make chunk negative, and the asm loop below
	 * only stops when its counter reaches exactly zero.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input1]), %%xmm0                  \n"
			"lddqu (%[input2]), %%xmm2                  \n"
			"add $16, %[input1]                         \n"
			"add $16, %[input2]                         \n"
			"movaps %%xmm0, %%xmm1                      \n"
			"unpcklps %%xmm2, %%xmm0                    \n"
			"unpckhps %%xmm2, %%xmm1                    \n"
			"paddsw %[scale_2_15], %%xmm0               \n"
			"paddsw %[scale_2_15], %%xmm1               \n"
			/* Keep large positive values from converting to
			 * 0x80000000, which would come out as -32768.
			 */
			"minps %[clamp_large], %%xmm0               \n"
			"minps %[clamp_large], %%xmm1               \n"
			"cvtps2dq %%xmm0, %%xmm0                    \n"
			"cvtps2dq %%xmm1, %%xmm1                    \n"
			"packssdw %%xmm1, %%xmm0                    \n"
			"movdqu %%xmm0, (%[output])                 \n"
			"add $16, %[output]                         \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input1]"+r"(input1),
			  [input2]"+r"(input2),
			  [output]"+r"(output)
			: /* input */
			  [scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
			  [clamp_large]"x"(_mm_set1_ps(32767.0f))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining (at most 3) frames, one at a time. */
	while (frames--) {
		*output++ = sse_round_to_s16(*input1++);
		*output++ = sse_round_to_s16(*input2++);
	}
}
373#define interleave_stereo interleave_stereo
374#endif
375
/* Converts interleaved signed 16-bit samples to one float buffer per channel.
 * Every sample is divided by 32768.
 *
 * Args:
 *    input - Interleaved samples, frame after frame; frames * channels
 *        values are read.
 *    output - Array of `channels` pointers; `frames` floats are written
 *        through each of them.
 *    channels - Number of channels. Zero or negative converts nothing.
 *    frames - Number of frames. Zero or negative converts nothing on the
 *        generic path.
 */
void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
			   int frames)
{
	int frame, ch;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/* Write through output[ch] directly rather than through a
	 * variable-length array of cursors: such an array has undefined
	 * behavior for channels <= 0 and unbounded stack use.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++)
			output[ch][frame] = *input++ / 32768.0f;
}
396
/* Converts one float sample to int16: scales by 32768 and rounds to nearest
 * with ties away from zero. The clamp is done on the float so that values
 * outside the int range are never cast (that cast would be undefined); NaN
 * becomes 0.
 */
static int16_t dsp_util_round_to_s16(float sample)
{
	float scaled = sample * 32768.0f;

	scaled += (scaled >= 0) ? 0.5f : -0.5f;
	if (scaled >= 32767.0f)
		return 32767;
	if (scaled <= -32768.0f)
		return -32768;
	if (scaled != scaled) /* NaN */
		return 0;
	return (int16_t)(int)scaled;
}

/* Converts one float buffer per channel to interleaved signed 16-bit samples.
 * Every sample is multiplied by 32768, rounded to nearest and clamped to
 * -32768 .. 32767.
 *
 * Args:
 *    input - Array of `channels` pointers; `frames` floats are read through
 *        each of them.
 *    output - Receives frames * channels interleaved shorts, frame after
 *        frame.
 *    channels - Number of channels. Zero or negative converts nothing.
 *    frames - Number of frames. Zero or negative converts nothing on the
 *        generic path.
 */
void dsp_util_interleave(float *const *input, int16_t *output, int channels,
			 int frames)
{
	int frame, ch;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	/* Read through input[ch] directly rather than through a
	 * variable-length array of cursors: such an array has undefined
	 * behavior for channels <= 0 and unbounded stack use.
	 */
	for (frame = 0; frame < frames; frame++)
		for (ch = 0; ch < channels; ch++)
			*output++ = dsp_util_round_to_s16(input[ch][frame]);
}
420
/* Turns on flush-to-zero handling of denormal floats for the calling thread's
 * floating point unit, so that denormals do not slow down DSP code.
 * On architectures other than x86 and ARM this only emits a compile-time
 * warning and does nothing at run time.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	unsigned int mxcsr;
	mxcsr = __builtin_ia32_stmxcsr();
	/* 0x8000 = flush-to-zero (FTZ), 0x0040 = denormals-are-zero (DAZ). */
	__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
	uint64_t cw;
	/* Set bit 24 (FZ) of the floating point control register. */
	__asm__ __volatile__ (
		"mrs    %0, fpcr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"msr    fpcr, %0			    \n"
		"isb					    \n"
		: "=r"(cw) :: "memory");
#elif defined(__arm__)
	uint32_t cw;
	/* Set bit 24 (FZ) of the floating point status and control register. */
	__asm__ __volatile__ (
		"vmrs   %0, fpscr			    \n"
		"orr    %0, %0, #0x1000000		    \n"
		"vmsr   fpscr, %0			    \n"
		: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
446