1/*
2 *
3 *  Bluetooth low-complexity, subband codec (SBC) library
4 *
5 *  Copyright (C) 2008-2010  Nokia Corporation
6 *  Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
7 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
8 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
9 *
10 *
11 *  This library is free software; you can redistribute it and/or
12 *  modify it under the terms of the GNU Lesser General Public
13 *  License as published by the Free Software Foundation; either
14 *  version 2.1 of the License, or (at your option) any later version.
15 *
16 *  This library is distributed in the hope that it will be useful,
17 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
18 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 *  Lesser General Public License for more details.
20 *
21 *  You should have received a copy of the GNU Lesser General Public
22 *  License along with this library; if not, write to the Free Software
23 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
24 *
25 */
26
27#include <stdint.h>
28#include <limits.h>
29#include "sbc.h"
30#include "sbc_math.h"
31#include "sbc_tables.h"
32
33#include "sbc_primitives_mmx.h"
34
35/*
36 * MMX optimizations
37 */
38
39#ifdef SBC_BUILD_WITH_MMX_SUPPORT
40
/*
 * Analysis filter step for the 4-subband case, MMX version.
 *
 * in:     windowed 16-bit input samples, read at byte offsets 0..79
 * out:    four 32-bit subband samples (16 bytes stored at out)
 * consts: packed 16-bit coefficient table; bytes 0..79 hold the FIR
 *         prototype (windowing) coefficients, bytes 80..111 the final
 *         modulation matrix
 *
 * Stage 1 multiply-accumulates the input against the prototype
 * coefficients with pmaddwd, adds the rounding constant, shifts the
 * partial sums down by SBC_PROTO_FIXED4_SCALE and packs them back to
 * 16 bits.  Stage 2 multiplies the packed sums by the modulation
 * matrix and stores the four 32-bit results.
 *
 * NOTE: leaves the CPU in MMX state; the caller is responsible for
 * executing "emms" before any x87 floating point code runs (done in
 * sbc_analyze_4b_4s_mmx).
 */
static inline void sbc_analyze_four_mmx(const int16_t *in, int32_t *out,
					const FIXED_T *consts)
{
	/* 0.5 in the stage-1 fixed-point scale, duplicated into both
	 * 32-bit lanes of one MMX-register-wide constant */
	static const SBC_ALIGNED int32_t round_c[2] = {
		1 << (SBC_PROTO_FIXED4_SCALE - 1),
		1 << (SBC_PROTO_FIXED4_SCALE - 1),
	};
	asm volatile (
		/* Stage 1: windowing — mm0/mm1 accumulate dot products of
		 * input and coefficients, pre-loaded with the rounding bias */
		"movq        (%0), %%mm0\n"
		"movq       8(%0), %%mm1\n"
		"pmaddwd     (%1), %%mm0\n"
		"pmaddwd    8(%1), %%mm1\n"
		"paddd       (%2), %%mm0\n"
		"paddd       (%2), %%mm1\n"
		"\n"
		"movq      16(%0), %%mm2\n"
		"movq      24(%0), %%mm3\n"
		"pmaddwd   16(%1), %%mm2\n"
		"pmaddwd   24(%1), %%mm3\n"
		"paddd      %%mm2, %%mm0\n"
		"paddd      %%mm3, %%mm1\n"
		"\n"
		"movq      32(%0), %%mm2\n"
		"movq      40(%0), %%mm3\n"
		"pmaddwd   32(%1), %%mm2\n"
		"pmaddwd   40(%1), %%mm3\n"
		"paddd      %%mm2, %%mm0\n"
		"paddd      %%mm3, %%mm1\n"
		"\n"
		"movq      48(%0), %%mm2\n"
		"movq      56(%0), %%mm3\n"
		"pmaddwd   48(%1), %%mm2\n"
		"pmaddwd   56(%1), %%mm3\n"
		"paddd      %%mm2, %%mm0\n"
		"paddd      %%mm3, %%mm1\n"
		"\n"
		"movq      64(%0), %%mm2\n"
		"movq      72(%0), %%mm3\n"
		"pmaddwd   64(%1), %%mm2\n"
		"pmaddwd   72(%1), %%mm3\n"
		"paddd      %%mm2, %%mm0\n"
		"paddd      %%mm3, %%mm1\n"
		"\n"
		/* Rescale and narrow the partial sums back to 16 bits with
		 * signed saturation */
		"psrad         %4, %%mm0\n"
		"psrad         %4, %%mm1\n"
		"packssdw   %%mm0, %%mm0\n"
		"packssdw   %%mm1, %%mm1\n"
		"\n"
		/* Stage 2: multiply by the modulation matrix held at byte
		 * offsets 80..111 of the constants table */
		"movq       %%mm0, %%mm2\n"
		"pmaddwd   80(%1), %%mm0\n"
		"pmaddwd   88(%1), %%mm2\n"
		"\n"
		"movq       %%mm1, %%mm3\n"
		"pmaddwd   96(%1), %%mm1\n"
		"pmaddwd  104(%1), %%mm3\n"
		"paddd      %%mm1, %%mm0\n"
		"paddd      %%mm3, %%mm2\n"
		"\n"
		/* Store the four 32-bit subband samples */
		"movq       %%mm0, (%3)\n"
		"movq       %%mm2, 8(%3)\n"
		:
		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
			"i" (SBC_PROTO_FIXED4_SCALE)
		: "cc", "memory");
}
106
/*
 * Analysis filter step for the 8-subband case, MMX version.
 *
 * in:     windowed 16-bit input samples, read at byte offsets 0..159
 * out:    eight 32-bit subband samples (32 bytes stored at out)
 * consts: packed 16-bit coefficient table; bytes 0..159 hold the FIR
 *         prototype (windowing) coefficients, bytes 160..287 the final
 *         modulation matrix
 *
 * Same two-stage structure as sbc_analyze_four_mmx, but all eight MMX
 * registers are used: mm0..mm3 accumulate the windowing stage while
 * mm4..mm7 stream in the next batch of products.  The modulation stage
 * is split in two passes (results 0..3, then 4..7) because only eight
 * registers are available.
 *
 * NOTE: leaves the CPU in MMX state; the caller must execute "emms"
 * afterwards (done in sbc_analyze_4b_8s_mmx).
 */
static inline void sbc_analyze_eight_mmx(const int16_t *in, int32_t *out,
							const FIXED_T *consts)
{
	/* 0.5 in the stage-1 fixed-point scale, duplicated into both
	 * 32-bit lanes of one MMX-register-wide constant */
	static const SBC_ALIGNED int32_t round_c[2] = {
		1 << (SBC_PROTO_FIXED8_SCALE - 1),
		1 << (SBC_PROTO_FIXED8_SCALE - 1),
	};
	asm volatile (
		/* Stage 1: windowing — mm0..mm3 hold the running sums,
		 * pre-loaded with the rounding bias */
		"movq        (%0), %%mm0\n"
		"movq       8(%0), %%mm1\n"
		"movq      16(%0), %%mm2\n"
		"movq      24(%0), %%mm3\n"
		"pmaddwd     (%1), %%mm0\n"
		"pmaddwd    8(%1), %%mm1\n"
		"pmaddwd   16(%1), %%mm2\n"
		"pmaddwd   24(%1), %%mm3\n"
		"paddd       (%2), %%mm0\n"
		"paddd       (%2), %%mm1\n"
		"paddd       (%2), %%mm2\n"
		"paddd       (%2), %%mm3\n"
		"\n"
		"movq      32(%0), %%mm4\n"
		"movq      40(%0), %%mm5\n"
		"movq      48(%0), %%mm6\n"
		"movq      56(%0), %%mm7\n"
		"pmaddwd   32(%1), %%mm4\n"
		"pmaddwd   40(%1), %%mm5\n"
		"pmaddwd   48(%1), %%mm6\n"
		"pmaddwd   56(%1), %%mm7\n"
		"paddd      %%mm4, %%mm0\n"
		"paddd      %%mm5, %%mm1\n"
		"paddd      %%mm6, %%mm2\n"
		"paddd      %%mm7, %%mm3\n"
		"\n"
		"movq      64(%0), %%mm4\n"
		"movq      72(%0), %%mm5\n"
		"movq      80(%0), %%mm6\n"
		"movq      88(%0), %%mm7\n"
		"pmaddwd   64(%1), %%mm4\n"
		"pmaddwd   72(%1), %%mm5\n"
		"pmaddwd   80(%1), %%mm6\n"
		"pmaddwd   88(%1), %%mm7\n"
		"paddd      %%mm4, %%mm0\n"
		"paddd      %%mm5, %%mm1\n"
		"paddd      %%mm6, %%mm2\n"
		"paddd      %%mm7, %%mm3\n"
		"\n"
		"movq      96(%0), %%mm4\n"
		"movq     104(%0), %%mm5\n"
		"movq     112(%0), %%mm6\n"
		"movq     120(%0), %%mm7\n"
		"pmaddwd   96(%1), %%mm4\n"
		"pmaddwd  104(%1), %%mm5\n"
		"pmaddwd  112(%1), %%mm6\n"
		"pmaddwd  120(%1), %%mm7\n"
		"paddd      %%mm4, %%mm0\n"
		"paddd      %%mm5, %%mm1\n"
		"paddd      %%mm6, %%mm2\n"
		"paddd      %%mm7, %%mm3\n"
		"\n"
		"movq     128(%0), %%mm4\n"
		"movq     136(%0), %%mm5\n"
		"movq     144(%0), %%mm6\n"
		"movq     152(%0), %%mm7\n"
		"pmaddwd  128(%1), %%mm4\n"
		"pmaddwd  136(%1), %%mm5\n"
		"pmaddwd  144(%1), %%mm6\n"
		"pmaddwd  152(%1), %%mm7\n"
		"paddd      %%mm4, %%mm0\n"
		"paddd      %%mm5, %%mm1\n"
		"paddd      %%mm6, %%mm2\n"
		"paddd      %%mm7, %%mm3\n"
		"\n"
		/* Rescale and narrow the partial sums back to 16 bits with
		 * signed saturation */
		"psrad         %4, %%mm0\n"
		"psrad         %4, %%mm1\n"
		"psrad         %4, %%mm2\n"
		"psrad         %4, %%mm3\n"
		"\n"
		"packssdw   %%mm0, %%mm0\n"
		"packssdw   %%mm1, %%mm1\n"
		"packssdw   %%mm2, %%mm2\n"
		"packssdw   %%mm3, %%mm3\n"
		"\n"
		/* Stage 2, first pass: modulation matrix columns producing
		 * output samples 0..3 */
		"movq       %%mm0, %%mm4\n"
		"movq       %%mm0, %%mm5\n"
		"pmaddwd  160(%1), %%mm4\n"
		"pmaddwd  168(%1), %%mm5\n"
		"\n"
		"movq       %%mm1, %%mm6\n"
		"movq       %%mm1, %%mm7\n"
		"pmaddwd  192(%1), %%mm6\n"
		"pmaddwd  200(%1), %%mm7\n"
		"paddd      %%mm6, %%mm4\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm2, %%mm6\n"
		"movq       %%mm2, %%mm7\n"
		"pmaddwd  224(%1), %%mm6\n"
		"pmaddwd  232(%1), %%mm7\n"
		"paddd      %%mm6, %%mm4\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm3, %%mm6\n"
		"movq       %%mm3, %%mm7\n"
		"pmaddwd  256(%1), %%mm6\n"
		"pmaddwd  264(%1), %%mm7\n"
		"paddd      %%mm6, %%mm4\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm4, (%3)\n"
		"movq       %%mm5, 8(%3)\n"
		"\n"
		/* Stage 2, second pass: remaining matrix columns producing
		 * output samples 4..7 (mm0..mm3 are consumed here) */
		"movq       %%mm0, %%mm5\n"
		"pmaddwd  176(%1), %%mm0\n"
		"pmaddwd  184(%1), %%mm5\n"
		"\n"
		"movq       %%mm1, %%mm7\n"
		"pmaddwd  208(%1), %%mm1\n"
		"pmaddwd  216(%1), %%mm7\n"
		"paddd      %%mm1, %%mm0\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm2, %%mm7\n"
		"pmaddwd  240(%1), %%mm2\n"
		"pmaddwd  248(%1), %%mm7\n"
		"paddd      %%mm2, %%mm0\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm3, %%mm7\n"
		"pmaddwd  272(%1), %%mm3\n"
		"pmaddwd  280(%1), %%mm7\n"
		"paddd      %%mm3, %%mm0\n"
		"paddd      %%mm7, %%mm5\n"
		"\n"
		"movq       %%mm0, 16(%3)\n"
		"movq       %%mm5, 24(%3)\n"
		:
		: "r" (in), "r" (consts), "r" (&round_c), "r" (out),
			"i" (SBC_PROTO_FIXED8_SCALE)
		: "cc", "memory");
}
248
249static inline void sbc_analyze_4b_4s_mmx(int16_t *x, int32_t *out,
250						int out_stride)
251{
252	/* Analyze blocks */
253	sbc_analyze_four_mmx(x + 12, out, analysis_consts_fixed4_simd_odd);
254	out += out_stride;
255	sbc_analyze_four_mmx(x + 8, out, analysis_consts_fixed4_simd_even);
256	out += out_stride;
257	sbc_analyze_four_mmx(x + 4, out, analysis_consts_fixed4_simd_odd);
258	out += out_stride;
259	sbc_analyze_four_mmx(x + 0, out, analysis_consts_fixed4_simd_even);
260
261	asm volatile ("emms\n");
262}
263
264static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out,
265						int out_stride)
266{
267	/* Analyze blocks */
268	sbc_analyze_eight_mmx(x + 24, out, analysis_consts_fixed8_simd_odd);
269	out += out_stride;
270	sbc_analyze_eight_mmx(x + 16, out, analysis_consts_fixed8_simd_even);
271	out += out_stride;
272	sbc_analyze_eight_mmx(x + 8, out, analysis_consts_fixed8_simd_odd);
273	out += out_stride;
274	sbc_analyze_eight_mmx(x + 0, out, analysis_consts_fixed8_simd_even);
275
276	asm volatile ("emms\n");
277}
278
/*
 * Compute per-channel, per-subband scale factors, MMX version.
 *
 * For each channel and each pair of adjacent subbands (two 32-bit
 * lanes per MMX register, hence sb += 2), the asm loop scans all
 * 'blocks' samples, ORs their magnitudes together, and derives the
 * scale factor as (position of the highest set bit) - SCALE_OUT_BITS.
 * Results are written to scale_factor[ch][sb] and [ch][sb + 1].
 */
static void sbc_calc_scalefactors_mmx(
	int32_t sb_sample_f[16][2][8],
	uint32_t scale_factor[2][8],
	int blocks, int channels, int subbands)
{
	/* Seed value 1 << SCALE_OUT_BITS in both lanes: guarantees bsrl
	 * below always finds a set bit, and makes the minimum scale
	 * factor come out as 0 */
	static const SBC_ALIGNED int32_t consts[2] = {
		1 << SCALE_OUT_BITS,
		1 << SCALE_OUT_BITS,
	};
	int ch, sb;
	intptr_t blk;
	for (ch = 0; ch < channels; ch++) {
		for (sb = 0; sb < subbands; sb += 2) {
			/* Byte offset of the last block's sample pair; the
			 * asm walks it down to 0 in block-stride steps */
			blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
				(char *) &sb_sample_f[0][0][0]));
			asm volatile (
				"movq         (%4), %%mm0\n"
			"1:\n"
				/* Branchless magnitude: mm1 becomes s - 1 for
				 * positive s, ~s for negative s — either way
				 * its top set bit reflects |s|'s magnitude */
				"movq     (%1, %0), %%mm1\n"
				"pxor        %%mm2, %%mm2\n"
				"pcmpgtd     %%mm2, %%mm1\n"
				"paddd    (%1, %0), %%mm1\n"
				"pcmpgtd     %%mm1, %%mm2\n"
				"pxor        %%mm2, %%mm1\n"

				/* Accumulate the OR of all blocks' magnitudes */
				"por         %%mm1, %%mm0\n"

				/* Step to the previous block; loop while the
				 * offset is still non-negative */
				"sub            %2, %0\n"
				"jns            1b\n"

				/* Low lane: highest-bit index minus
				 * SCALE_OUT_BITS -> scale_factor[ch][sb] */
				"movd        %%mm0, %k0\n"
				"psrlq         $32, %%mm0\n"
				"bsrl          %k0, %k0\n"
				"subl           %5, %k0\n"
				"movl          %k0, (%3)\n"

				/* High lane -> scale_factor[ch][sb + 1] */
				"movd        %%mm0, %k0\n"
				"bsrl          %k0, %k0\n"
				"subl           %5, %k0\n"
				"movl          %k0, 4(%3)\n"
			: "+r" (blk)
			: "r" (&sb_sample_f[0][ch][sb]),
				"i" ((char *) &sb_sample_f[1][0][0] -
					(char *) &sb_sample_f[0][0][0]),
				"r" (&scale_factor[ch][sb]),
				"r" (&consts),
				"i" (SCALE_OUT_BITS)
			: "cc", "memory");
		}
	}
	/* Clear MMX state so later x87 floating point code is safe */
	asm volatile ("emms\n");
}
331
/*
 * Runtime check for MMX availability.
 *
 * Returns non-zero when MMX instructions can be used, 0 otherwise.
 * On x86-64 this is unconditional; on 32-bit x86 it probes the CPUID
 * instruction and then tests the MMX feature bit (CPUID.1:EDX bit 23).
 *
 * Fixed: the return statement was indented with spaces, inconsistent
 * with the tab indentation used throughout this file.
 */
static int check_mmx_support(void)
{
#ifdef __amd64__
	return 1; /* We assume that all 64-bit processors have MMX support */
#else
	int cpuid_feature_information;
	asm volatile (
		/* According to Intel manual, CPUID instruction is supported
		 * if the value of ID bit (bit 21) in EFLAGS can be modified */
		"pushf\n"
		"movl     (%%esp),   %0\n"
		"xorl     $0x200000, (%%esp)\n" /* try to modify ID bit */
		"popf\n"
		"pushf\n"
		"xorl     (%%esp),   %0\n"      /* check if ID bit changed */
		"jz       1f\n"                 /* no CPUID: %0 is 0 here */
		"push     %%eax\n"
		"push     %%ebx\n"
		"push     %%ecx\n"
		"mov      $1,        %%eax\n"
		"cpuid\n"                       /* feature bits land in edx */
		"pop      %%ecx\n"
		"pop      %%ebx\n"
		"pop      %%eax\n"
		"1:\n"
		"popf\n"
		: "=d" (cpuid_feature_information)
		:
		: "cc");
	/* CPUID.1:EDX bit 23 = MMX supported */
	return cpuid_feature_information & (1 << 23);
#endif
}
364
365void sbc_init_primitives_mmx(struct sbc_encoder_state *state)
366{
367	if (check_mmx_support()) {
368		state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx;
369		state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx;
370		state->sbc_calc_scalefactors = sbc_calc_scalefactors_mmx;
371		state->implementation_info = "MMX";
372	}
373}
374
375#endif
376