/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
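
/* The *_RESULT macros above take the vector whose low 32-bit lane holds the
 * finished prediction, apply the quantization shift, and emit one residual
 * sample (RESIDUAL*) or one reconstructed sample (DATA*). */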

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

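	/* For orders up to 12, each coefficient is broadcast into the low 16 bits
	   of every 32-bit lane, so _mm_madd_epi16 against four consecutive samples
	   yields coeff * sample per lane (this variant is intended for data and
	   coefficients that fit in 16 bits). Four residuals are produced per
	   iteration; the scalar loop below the vector code handles the leftover
	   samples. */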
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
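		/* the switch below relies on case fall-through: starting at `order', every lower tap is accumulated as well */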
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

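	/* For orders up to 12, coefficients and samples are arranged in the even
	   32-bit lanes (two per register) so a single _mm_mul_epu32 forms two
	   products at a time; only the low 32 bits of each 64-bit product are
	   kept, which is equivalent to the signed multiply modulo 2^32 used by
	   the scalar code. */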
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0   0        d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]   0        d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);

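	/* The most recent samples are kept packed as 16-bit values in xmm registers;
	   each iteration multiply-adds them against the packed coefficients,
	   reconstructs one sample with DATA16_RESULT and shifts it back into the
	   window with _mm_insert_epi16. */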
	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order)                                          /* ...and zero them out */
		{
		case 9:
			xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
		case 10:
			xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
		case 11:
			xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
			xmm2, xmm7: qlp_coeff << 16 bit
			xmm3, xmm4: data */

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}

		while(data_len) { /* data_len is a multiple of 2 */
			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
	else
	{
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);

		/* xmm0: qlp_coeff
			xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */
