/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */

/*   new routines: more unaligned loads, fewer shuffles
 *   old routines: fewer unaligned loads, more shuffles
 *   these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
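
/* For reference, the scalar computation that every routine in this file
 * vectorizes is, in effect (a sketch for illustration only, not part of
 * the build; the generic portable version lives in lpc.c):
 *
 *	for(j = 0; j < lag; j++) {
 *		autoc[j] = 0.0f;
 *		for(i = 0; i + j < data_len; i++)
 *			autoc[j] += data[i] * data[i+j];
 *	}
 */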

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

	/* head: for every i with a full in-bounds 4-float load,
	 * accumulate sum0[j] += data[i] * data[i+j], j = 0..3 */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

	/* tail: walk the last lag-1 samples backwards, keeping the samples
	 * already seen in d0 as a shift register so no load runs past the
	 * end of the buffer */
	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
}
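
/* Usage sketch (hypothetical direct caller; inside libFLAC these routines
 * are normally reached through function pointers selected at runtime by
 * the CPU detection in private/cpu.h, not called directly):
 *
 *	FLAC__real autoc[4];
 *	FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(data, data_len, 4, autoc);
 *	// autoc[0..3] now hold the first four autocorrelation coefficients
 */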

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* same scheme as the lag==4 routine above, with two accumulators covering autoc[0..7] */
	int i;
	int limit = data_len - 8;
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* same scheme as the lag==4 routine above, with three accumulators covering autoc[0..11] */
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	/* same scheme as the lag==4 routine above, with four accumulators covering autoc[0..15] */
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = d0; d = _mm_shuffle_ps(d, d, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}

/* old routines: faster on older Intel CPUs (up to Core 2) */
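
/* The *_old routines keep the previous lag samples in xmm registers as a
 * shift register instead of reloading them.  A scalar sketch of the same
 * idea for the lag==4 case (illustration only; the name window is
 * hypothetical):
 *
 *	FLAC__real window[4] = { 0, 0, 0, 0 };
 *	for(i = 0; i < data_len; i++) {
 *		for(j = 3; j > 0; j--)          // make room for the new sample
 *			window[j] = window[j-1];
 *		window[0] = data[i];            // window[j] == data[i-j]
 *		for(j = 0; j < 4; j++)          // autoc[j] += data[i] * data[i-j]
 *			autoc[j] += data[i] * window[j];
 *	}
 *
 * The zero-initialized window slots make the out-of-range products vanish,
 * which is why the SSE code below only special-cases the first sample.
 */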

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	/* first sample: xmm2 is the window of previous samples, xmm5 accumulates autoc[0..3] */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm2 left by one float and insert the new sample */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		/* xmm5 += xmm0 * xmm2 */
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}

FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12,xmm9);
}

#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */