1/* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6#include "dsp_util.h"
7
#ifndef max
/*
 * max(a, b) - yields the larger of the two arguments; when neither compares
 * greater, the second one is returned.
 *
 * Both arguments are copied into temporaries first, so each is evaluated
 * exactly once.  Uses the GNU statement-expression and __typeof__ extensions.
 */
#define max(a, b)				\
	({					\
		__typeof__(a) _lhs = (a);	\
		__typeof__(b) _rhs = (b);	\
		_lhs > _rhs ? _lhs : _rhs;	\
	})
#endif
13
#ifndef min
/*
 * min(a, b) - yields the smaller of the two arguments; when neither compares
 * less, the second one is returned.
 *
 * Both arguments are copied into temporaries first, so each is evaluated
 * exactly once.  Uses the GNU statement-expression and __typeof__ extensions.
 */
#define min(a, b)				\
	({					\
		__typeof__(a) _x = (a);		\
		__typeof__(b) _y = (b);		\
		_x < _y ? _x : _y;		\
	})
#endif
19
20#undef deinterleave_stereo
21#undef interleave_stereo
22
23#ifdef __ARM_NEON__
24#include <arm_neon.h>
25
26static void deinterleave_stereo(int16_t *input, float *output1,
27				float *output2, int frames)
28{
29	/* Process 8 frames (16 samples) each loop. */
30	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
31	int chunk = frames >> 3;
32	frames &= 7;
33	if (chunk) {
34		__asm__ __volatile__ (
35			"1:					    \n"
36			"vld2.16 {d0-d3}, [%[input]]!		    \n"
37			"subs %[chunk], #1			    \n"
38			"vmovl.s16 q3, d3			    \n"
39			"vmovl.s16 q2, d2			    \n"
40			"vmovl.s16 q1, d1			    \n"
41			"vmovl.s16 q0, d0			    \n"
42			"vcvt.f32.s32 q3, q3, #15		    \n"
43			"vcvt.f32.s32 q2, q2, #15		    \n"
44			"vcvt.f32.s32 q1, q1, #15		    \n"
45			"vcvt.f32.s32 q0, q0, #15		    \n"
46			"vst1.32 {d4-d7}, [%[output2]]!		    \n"
47			"vst1.32 {d0-d3}, [%[output1]]!		    \n"
48			"bne 1b					    \n"
49			: /* output */
50			  [chunk]"+r"(chunk),
51			  [input]"+r"(input),
52			  [output1]"+r"(output1),
53			  [output2]"+r"(output2)
54			: /* input */
55			: /* clobber */
56			  "q0", "q1", "q2", "q3", "memory", "cc"
57			);
58	}
59
60	/* The remaining samples. */
61	while (frames--) {
62		*output1++ = *input++ / 32768.0f;
63		*output2++ = *input++ / 32768.0f;
64	}
65}
66#define deinterleave_stereo deinterleave_stereo
67
/*
 * Scalar float -> int16 conversion for the frames the NEON loop leaves over:
 * round half away from zero, then saturate.  The clamp is done on the float
 * value because converting an out-of-range float to an integer is undefined.
 */
static int16_t neon_tail_float_to_s16(float f)
{
	f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
	f *= 32768.0f;
	if (f >= 32767.0f)
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	if (f != f)
		return 0;	/* NaN has no sample value; emit silence. */
	return (int16_t)f;
}

/*
 * NEON stereo interleave.
 *
 * Merges two float channel buffers into interleaved signed 16-bit samples
 * (L0 R0 L1 R1 ...), scaling by 32768 with rounding and saturation.
 *
 *    input1 - samples written to the first slot of every frame.
 *    input2 - samples written to the second slot of every frame.
 *    output - receives 2 * frames interleaved samples.
 *    frames - number of frames to convert; nothing is done when <= 0.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	/* Rounding offsets, already divided by the 32768 scale factor. */
	float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
	float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
	int chunk;

	/*
	 * A negative count would shift to a non-zero chunk count below and
	 * send the assembly loop far past the end of the buffers.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		__asm__ __volatile__ (
			"veor q0, q0, q0			    \n"
			"1:					    \n"
			"vld1.32 {d2-d3}, [%[input1]]!		    \n"
			"vld1.32 {d4-d5}, [%[input2]]!		    \n"
			"subs %[chunk], #1			    \n"
			/* We try to round to the nearest number by adding 0.5
			 * to positive input, and adding -0.5 to the negative
			 * input, then truncate.
			 */
			"vcgt.f32 q3, q1, q0			    \n"
			"vcgt.f32 q4, q2, q0			    \n"
			"vbsl q3, %q[pos], %q[neg]		    \n"
			"vbsl q4, %q[pos], %q[neg]		    \n"
			"vadd.f32 q1, q1, q3			    \n"
			"vadd.f32 q2, q2, q4			    \n"
			"vcvt.s32.f32 q1, q1, #15		    \n"
			"vcvt.s32.f32 q2, q2, #15		    \n"
			"vqmovn.s32 d2, q1			    \n"
			"vqmovn.s32 d3, q2			    \n"
			"vst2.16 {d2-d3}, [%[output]]!		    \n"
			"bne 1b					    \n"
			: /* output */
			  "=r"(chunk),
			  "=r"(input1),
			  "=r"(input2),
			  "=r"(output)
			: /* input */
			  [chunk]"0"(chunk),
			  [input1]"1"(input1),
			  [input2]"2"(input2),
			  [output]"3"(output),
			  [pos]"w"(pos),
			  [neg]"w"(neg)
			: /* clobber */
			  "q0", "q1", "q2", "q3", "q4", "memory", "cc"
			);
	}

	/* The remaining samples (fewer than 4 frames). */
	while (frames--) {
		*output++ = neon_tail_float_to_s16(*input1++);
		*output++ = neon_tail_float_to_s16(*input2++);
	}
}
/* Self-define so that "#ifdef interleave_stereo" sees this fast path. */
#define interleave_stereo interleave_stereo
130
131#endif
132
133#ifdef __SSE3__
134#include <emmintrin.h>
135
/*
 * SSE stereo de-interleave.
 *
 * Splits interleaved signed 16-bit stereo samples (L0 R0 L1 R1 ...) into two
 * float buffers, dividing every sample by 32768.
 *
 *    input   - interleaved samples; 2 * frames values are read.
 *    output1 - receives the first sample of every frame.
 *    output2 - receives the second sample of every frame.
 *    frames  - number of frames to convert; nothing is done when <= 0.
 */
static void deinterleave_stereo(int16_t *input, float *output1,
				float *output2, int frames)
{
	int chunk;

	/*
	 * A negative count would shift to a non-zero chunk count below and
	 * send the assembly loop far past the end of the buffers.
	 */
	if (frames <= 0)
		return;

	/* Process 8 frames (16 samples) each loop. */
	/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
	chunk = frames >> 3;
	frames &= 7;
	if (chunk) {
		/*
		 * Each 32-bit lane holds one frame.  Shifting left by 16
		 * leaves the first sample in the top half (hence the 2^-31
		 * scale); an arithmetic shift right by 16 sign-extends the
		 * second sample (hence the 2^-15 scale).
		 */
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input]), %%xmm0                   \n"
			"lddqu 16(%[input]), %%xmm1                 \n"
			"add $32, %[input]                          \n"
			"movdqa %%xmm0, %%xmm2                      \n"
			"movdqa %%xmm1, %%xmm3                      \n"
			"pslld $16, %%xmm0                          \n"
			"pslld $16, %%xmm1                          \n"
			"psrad $16, %%xmm2                          \n"
			"psrad $16, %%xmm3                          \n"
			"cvtdq2ps %%xmm0, %%xmm0                    \n"
			"cvtdq2ps %%xmm1, %%xmm1                    \n"
			"cvtdq2ps %%xmm2, %%xmm2                    \n"
			"cvtdq2ps %%xmm3, %%xmm3                    \n"
			"mulps %[scale_2_n31], %%xmm0               \n"
			"mulps %[scale_2_n31], %%xmm1               \n"
			"mulps %[scale_2_n15], %%xmm2               \n"
			"mulps %[scale_2_n15], %%xmm3               \n"
			"movdqu %%xmm0, (%[output1])                \n"
			"movdqu %%xmm1, 16(%[output1])              \n"
			"movdqu %%xmm2, (%[output2])                \n"
			"movdqu %%xmm3, 16(%[output2])              \n"
			"add $32, %[output1]                        \n"
			"add $32, %[output2]                        \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  [chunk]"+r"(chunk),
			  [input]"+r"(input),
			  [output1]"+r"(output1),
			  [output2]"+r"(output2)
			: /* input */
			  [scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
			  [scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
			);
	}

	/* The remaining samples (fewer than 8 frames). */
	while (frames--) {
		*output1++ = *input++ / 32768.0f;
		*output2++ = *input++ / 32768.0f;
	}
}
/* Self-define so that "#ifdef deinterleave_stereo" sees this fast path. */
#define deinterleave_stereo deinterleave_stereo
191
/*
 * Scalar float -> int16 conversion for the frames the SSE loop leaves over:
 * round half away from zero, then saturate.  The clamp is done on the float
 * value because converting an out-of-range float to an integer is undefined.
 */
static int16_t sse_tail_float_to_s16(float f)
{
	f += (f > 0) ? (0.5f / 32768.0f) : (-0.5f / 32768.0f);
	f *= 32768.0f;
	if (f >= 32767.0f)
		return 32767;
	if (f <= -32768.0f)
		return -32768;
	if (f != f)
		return 0;	/* NaN has no sample value; emit silence. */
	return (int16_t)f;
}

/*
 * SSE stereo interleave.
 *
 * Merges two float channel buffers into interleaved signed 16-bit samples
 * (L0 R0 L1 R1 ...), scaling by 32768 and saturating to the int16 range.
 *
 *    input1 - samples written to the first slot of every frame.
 *    input2 - samples written to the second slot of every frame.
 *    output - receives 2 * frames interleaved samples.
 *    frames - number of frames to convert; nothing is done when <= 0.
 *
 * NOTE(review): cvtps2dq appears to return 0x80000000 for NaN and for values
 * outside the int32 range, which packssdw then turns into -32768, so inputs
 * of 65536.0 and above would leave the vector loop as -32768 while the scalar
 * tail saturates by sign.  Confirm whether the vector loop needs a clamp.
 */
static void interleave_stereo(float *input1, float *input2,
			      int16_t *output, int frames)
{
	int chunk;

	/*
	 * A negative count would shift to a non-zero chunk count below and
	 * send the assembly loop far past the end of the buffers.
	 */
	if (frames <= 0)
		return;

	/* Process 4 frames (8 samples) each loop. */
	/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
	chunk = frames >> 2;
	frames &= 3;

	if (chunk) {
		/*
		 * unpcklps/unpckhps do the interleaving on floats, cvtps2dq
		 * converts using the current MXCSR rounding mode and
		 * packssdw narrows to 16 bits with signed saturation.
		 */
		__asm__ __volatile__ (
			"1:                                         \n"
			"lddqu (%[input1]), %%xmm0                  \n"
			"lddqu (%[input2]), %%xmm2                  \n"
			"movaps %%xmm0, %%xmm1                      \n"
			"unpcklps %%xmm2, %%xmm0                    \n"
			"unpckhps %%xmm2, %%xmm1                    \n"
			"add $16, %[input1]                         \n"
			"add $16, %[input2]                         \n"
			"mulps %[scale_2_15], %%xmm0                \n"
			"mulps %[scale_2_15], %%xmm1                \n"
			"cvtps2dq %%xmm0, %%xmm0                    \n"
			"cvtps2dq %%xmm1, %%xmm1                    \n"
			"packssdw %%xmm1, %%xmm0                    \n"
			"movdqu %%xmm0, (%[output])                 \n"
			"add $16, %[output]                         \n"
			"sub $1, %[chunk]                           \n"
			"jnz 1b                                     \n"
			: /* output */
			  "=r"(chunk),
			  "=r"(input1),
			  "=r"(input2),
			  "=r"(output)
			: /* input */
			  [chunk]"0"(chunk),
			  [input1]"1"(input1),
			  [input2]"2"(input2),
			  [output]"3"(output),
			  [scale_2_15]"x"(_mm_set1_ps(1.0f*(1<<15)))
			: /* clobber */
			  "xmm0", "xmm1", "xmm2", "memory", "cc"
			);
	}

	/* The remaining samples (fewer than 4 frames). */
	while (frames--) {
		*output++ = sse_tail_float_to_s16(*input1++);
		*output++ = sse_tail_float_to_s16(*input2++);
	}
}
/* Self-define so that "#ifdef interleave_stereo" sees this fast path. */
#define interleave_stereo interleave_stereo
247
248#endif
249
/*
 * Converts interleaved signed 16-bit samples into per-channel float buffers,
 * dividing every sample by 32768.
 *
 *    input    - interleaved samples; channels * frames values are read.
 *    output   - array of `channels` buffers, each receiving `frames` floats.
 *    channels - number of channels; nothing is written when <= 0.
 *    frames   - number of frames; nothing is written when <= 0.
 *
 * When an optimized deinterleave_stereo is available it handles the
 * two-channel case.
 */
void dsp_util_deinterleave(int16_t *input, float *const *output, int channels,
			   int frames)
{
	int i, j;

#ifdef deinterleave_stereo
	if (channels == 2) {
		deinterleave_stereo(input, output[0], output[1], frames);
		return;
	}
#endif

	/*
	 * Index the channel buffers directly instead of keeping a
	 * variable-length array of cursors: such an array is undefined for a
	 * non-positive channel count and grows the stack for no benefit.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++)
			output[j][i] = *input++ / 32768.0f;
}
270
/*
 * Converts per-channel float buffers into interleaved signed 16-bit samples.
 * Each sample is scaled by 32768, saturated to [-32768, 32767] and rounded
 * half away from zero.
 *
 *    input    - array of `channels` buffers, each holding `frames` floats.
 *    output   - receives channels * frames interleaved samples.
 *    channels - number of channels; nothing is written when <= 0.
 *    frames   - number of frames; nothing is written when <= 0.
 *
 * When an optimized interleave_stereo is available it handles the
 * two-channel case.
 */
void dsp_util_interleave(float *const *input, int16_t *output, int channels,
			 int frames)
{
	int i, j;

#ifdef interleave_stereo
	if (channels == 2) {
		interleave_stereo(input[0], input[1], output, frames);
		return;
	}
#endif

	/*
	 * Index the channel buffers directly instead of keeping a
	 * variable-length array of cursors: such an array is undefined for a
	 * non-positive channel count and grows the stack for no benefit.
	 */
	for (i = 0; i < frames; i++)
		for (j = 0; j < channels; j++) {
			int16_t i16;
			float f = input[j][i] * 32768.0f;

			/* Saturate first so the cast below stays in range. */
			if (f > 32767)
				i16 = 32767;
			else if (f < -32768)
				i16 = -32768;
			else
				i16 = (int16_t) (f > 0 ? f + 0.5f : f - 0.5f);
			*output++ = i16;
		}
}
300
/*
 * Switches the calling thread's floating-point unit into a mode where
 * denormal values are treated as zero.
 *
 * On x86 this sets the flush-to-zero and denormals-are-zero bits of MXCSR;
 * on 32-bit ARM it sets bit 24 of the register accessed through
 * coprocessor p10 (cr1).  Other architectures only get a build-time warning.
 */
void dsp_enable_flush_denormal_to_zero(void)
{
#if defined(__i386__) || defined(__x86_64__)
	enum {
		MXCSR_DAZ = 1 << 6,	/* denormal inputs are read as zero */
		MXCSR_FTZ = 1 << 15,	/* denormal results are flushed to zero */
	};
	unsigned int mxcsr;

	mxcsr = __builtin_ia32_stmxcsr();
	__builtin_ia32_ldmxcsr(mxcsr | MXCSR_FTZ | MXCSR_DAZ);
#elif defined(__arm__)
	enum {
		FPSCR_FZ = 1 << 24,	/* flush-to-zero mode */
	};
	int cw;

	/* Read-modify-write of the floating-point control register. */
	__asm__ __volatile__ ("mrc p10, 7, %0, cr1, cr0, 0" : "=r" (cw));
	__asm__ __volatile__ ("mcr p10, 7, %0, cr1, cr0, 0" : : "r" (cw | FPSCR_FZ));
#else
#warning "Don't know how to disable denorms. Performance may suffer."
#endif
}
315