/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

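/*
 * 16-bit residual kernel: both the samples and the quantized LP coefficients
 * are assumed to fit in 16 bits (hence the "_16" in the name and the
 * "0xffff &" masks).  Each coefficient is broadcast into the low 16-bit half
 * of every 32-bit lane (high half zero), so _mm256_madd_epi16 yields the
 * exact 32-bit product coefficient*sample per lane.  The unrolled branches
 * below cover orders 1..12, producing eight residuals per iteration;
 * leftover samples and orders 13..32 fall back to plain C further down.
 * lp_quantization is kept in an XMM register (cnt) because the vector shift
 * intrinsics take their count from a register.
 */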
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
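		/* remaining samples (fewer than 8) in plain C; the switch cases fall
		 * through so that all `order` taps are accumulated */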
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

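/*
 * General 32-bit residual kernel: coefficients are broadcast whole and the
 * products are formed with _mm256_mullo_epi32, i.e. the prediction is
 * accumulated in 32 bits.  The structure mirrors the 16-bit version above:
 * unrolled branches for orders 1..12 (eight residuals per iteration), with
 * plain C for leftover samples and for orders 13..32.
 */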
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);
					q11 = _mm256_set1_epi32(qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

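/* index vector for _mm256_permutevar8x32_epi32: gathers the low 32-bit word
 * of each 64-bit lane into the lower 128 bits of the result */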
static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };

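/*
 * Wide residual kernel: the prediction is accumulated in 64 bits, four
 * residuals per iteration.  Zero-extending samples and coefficients with
 * _mm256_cvtepu32_epi64 is enough because _mm256_mul_epi32 reads only the
 * low 32 bits of each 64-bit lane (as signed integers), so the products are
 * the exact signed 64-bit results.
 */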
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */

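	/* each iteration below computes four 64-bit predictions, shifts them by
	 * lp_quantization (logical shift, see the assertion above), packs their
	 * low 32-bit words with `pack`, and subtracts from the data to store
	 * four residuals */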
774	if(order <= 12) {
775		if(order > 8) {
776			if(order > 10) {
777				if(order == 12) {
778					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
779					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
780					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
781					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
782					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
783					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
784					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
785					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
786					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
787					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
788					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
789					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
790					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
791
792					for(i = 0; i < (int)data_len-3; i+=4) {
793						__m256i summ, mull;
794						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
795						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
796						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
797						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
798						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
799						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
800						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
801						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
802						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
803						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
804						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
805						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
806						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
807						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
808					}
809				}
810				else { /* order == 11 */
811					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
812					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
813					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
814					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
815					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
816					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
817					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
818					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
819					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
820					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
821					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
822					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
823
824					for(i = 0; i < (int)data_len-3; i+=4) {
825						__m256i summ, mull;
826						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
827						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
828						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
829						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
830						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
831						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
832						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
833						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
834						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
835						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
836						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
837						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
838						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
839					}
840				}
841			}
842			else {
843				if(order == 10) {
844					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
845					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
846					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
847					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
848					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
849					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
850					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
851					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
852					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
853					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
854					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
855
856					for(i = 0; i < (int)data_len-3; i+=4) {
857						__m256i summ, mull;
858						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
859						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
860						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
861						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
862						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
863						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
864						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
865						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
866						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
867						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
868						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
869						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
870					}
871				}
872				else { /* order == 9 */
873					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
874					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
875					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
876					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
877					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
878					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
879					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
880					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
881					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
882					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
883
884					for(i = 0; i < (int)data_len-3; i+=4) {
885						__m256i summ, mull;
886						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
887						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
888						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
889						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
890						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
891						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
892						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
893						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
894						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
895						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
896						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
897					}
898				}
899			}
900		}
901		else if(order > 4) {
902			if(order > 6) {
903				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
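			/* order <= 4: same vectorized scheme as above with fewer taps per sample */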
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
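		/* Scalar tail: the vector loops advance 4 samples at a time, so finish any
		 * remaining samples here. The switch intentionally falls through from the
		 * highest tap of this order down to tap 0. */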
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
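		/* No AVX2 kernel for orders above 12: compute every residual with the plain
		 * scalar formula, again relying on intentional switch fall-through. */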
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
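	/* Clear the upper halves of the YMM registers to avoid AVX/SSE transition penalties. */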
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */