1ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes/* libFLAC - Free Lossless Audio Codec library
2ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * Copyright (C) 2000-2009  Josh Coalson
3ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * Copyright (C) 2011-2016  Xiph.Org Foundation
4ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes *
5ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * Redistribution and use in source and binary forms, with or without
6ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * modification, are permitted provided that the following conditions
7ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * are met:
8ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes *
9ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * - Redistributions of source code must retain the above copyright
10ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * notice, this list of conditions and the following disclaimer.
11ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes *
12ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * - Redistributions in binary form must reproduce the above copyright
13ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * notice, this list of conditions and the following disclaimer in the
14ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * documentation and/or other materials provided with the distribution.
15ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes *
16ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * - Neither the name of the Xiph.org Foundation nor the names of its
17ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * contributors may be used to endorse or promote products derived from
18ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * this software without specific prior written permission.
19ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes *
20ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes */
32ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
33ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef HAVE_CONFIG_H
34ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#  include <config.h>
35ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif
36ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
37ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/cpu.h"
38ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
39ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__INTEGER_ONLY_LIBRARY
40ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__NO_ASM
41ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
42ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/lpc.h"
43ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef FLAC__AVX2_SUPPORTED
44ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
45ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/assert.h"
46ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/format.h"
47ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
48ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include <immintrin.h> /* AVX2 */
49ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
50ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2")
51ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
52ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{
53ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	int i;
54ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__int32 sum;
55ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
56ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
57ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order > 0);
58ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order <= 32);
59ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
60ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	if(order <= 12) {
61ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		if(order > 8) {
62ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 10) {
63ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 12) {
64ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
65ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
66ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
67ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
68ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
69ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
70ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
71ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
72ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
73ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
74ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
75ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
76ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
77ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
78ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
79ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
80ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
81ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
82ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
83ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
84ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
85ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
86ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
87ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
88ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
89ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
90ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
91ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
92ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
93ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
94ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
95ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
96ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 11 */
97ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
98ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
99ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
100ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
101ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
102ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
103ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
104ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
105ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
106ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
107ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
108ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
109ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
110ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
111ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
112ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
113ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
114ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
115ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
116ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
117ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
118ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
119ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
120ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
121ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
122ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
123ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
124ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
125ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
126ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
127ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
128ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
129ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 10) {
130ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
131ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
132ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
133ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
134ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
135ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
136ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
137ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
138ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
139ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
140ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
141ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
142ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
143ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
144ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
145ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
146ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
147ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
148ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
149ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
150ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
151ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
152ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
153ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
154ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
155ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
156ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
157ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
158ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 9 */
159ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
160ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
161ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
162ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
163ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
164ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
165ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
166ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
167ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
168ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
169ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
170ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
171ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
172ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9 )));
173ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
174ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
175ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
176ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
177ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
178ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
179ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
180ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
181ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
182ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
183ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
184ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
185ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
186ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
187ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else if(order > 4) {
188ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 6) {
189ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 8) {
190ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
191ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
192ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
193ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
194ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
195ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
196ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
197ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
198ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
199ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
200ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
201ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
202ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8 )));
203ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
204ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
205ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
206ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
207ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
208ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
209ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
210ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
211ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
212ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
213ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
214ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 7 */
215ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6;
216ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
217ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
218ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
219ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
220ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
221ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
222ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
223ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
224ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
225ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
226ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7 )));
227ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
228ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
229ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
230ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
231ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
232ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
233ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
234ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
235ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
236ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
237ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
238ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
239ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 6) {
240ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5;
241ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
242ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
243ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
244ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
245ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
246ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
247ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
248ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
249ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
250ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6 )));
251ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
252ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
253ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
254ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
255ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
256ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
257ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
258ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
259ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
260ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 5 */
261ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4;
262ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
263ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
264ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
265ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
266ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
267ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
268ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
269ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
270ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5 )));
271ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
272ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
273ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
274ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
275ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
276ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
277ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
278ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
279ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
280ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
281ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else {
282ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 2) {
283ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 4) {
284ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3;
285ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
286ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
287ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
288ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
289ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
290ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
291ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
292ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4 )));
293ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
294ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
295ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
296ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
297ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
298ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
299ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
300ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 3 */
301ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2;
302ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
303ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
304ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
305ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
306ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
307ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
308ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3 )));
309ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
310ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
311ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
312ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
313ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
314ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
315ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
316ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
317ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 2) {
318ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1;
319ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
320ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
321ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
322ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
323ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
324ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2 )));
325ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
326ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
327ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
328ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
329ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
330ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 1 */
331ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0;
332ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
333ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
334ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
335ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ;
336ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_madd_epi16(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1 )));
337ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
338ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
339ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
340ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
341ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
342ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
343ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(; i < (int)data_len; i++) {
344ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
345ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
346ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 12: sum += qlp_coeff[11] * data[i-12];
347ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 11: sum += qlp_coeff[10] * data[i-11];
348ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 10: sum += qlp_coeff[ 9] * data[i-10];
349ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
350ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
351ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
352ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
353ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
354ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
355ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
356ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
357ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
358ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
359ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (sum >> lp_quantization);
360ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
361ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
362ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	else { /* order > 12 */
363ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(i = 0; i < (int)data_len; i++) {
364ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
365ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
366ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 32: sum += qlp_coeff[31] * data[i-32];
367ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 31: sum += qlp_coeff[30] * data[i-31];
368ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 30: sum += qlp_coeff[29] * data[i-30];
369ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 29: sum += qlp_coeff[28] * data[i-29];
370ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 28: sum += qlp_coeff[27] * data[i-28];
371ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 27: sum += qlp_coeff[26] * data[i-27];
372ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 26: sum += qlp_coeff[25] * data[i-26];
373ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 25: sum += qlp_coeff[24] * data[i-25];
374ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 24: sum += qlp_coeff[23] * data[i-24];
375ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 23: sum += qlp_coeff[22] * data[i-23];
376ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 22: sum += qlp_coeff[21] * data[i-22];
377ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 21: sum += qlp_coeff[20] * data[i-21];
378ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 20: sum += qlp_coeff[19] * data[i-20];
379ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 19: sum += qlp_coeff[18] * data[i-19];
380ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 18: sum += qlp_coeff[17] * data[i-18];
381ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 17: sum += qlp_coeff[16] * data[i-17];
382ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 16: sum += qlp_coeff[15] * data[i-16];
383ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 15: sum += qlp_coeff[14] * data[i-15];
384ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 14: sum += qlp_coeff[13] * data[i-14];
385ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 13: sum += qlp_coeff[12] * data[i-13];
386ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[11] * data[i-12];
387ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[10] * data[i-11];
388ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 9] * data[i-10];
389ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 8] * data[i- 9];
390ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 7] * data[i- 8];
391ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 6] * data[i- 7];
392ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 5] * data[i- 6];
393ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 4] * data[i- 5];
394ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 3] * data[i- 4];
395ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 2] * data[i- 3];
396ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 1] * data[i- 2];
397ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 0] * data[i- 1];
398ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
399ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (sum >> lp_quantization);
400ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
401ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
402ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	_mm256_zeroupper();
403ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes}
404ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
405ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2")
406ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
407ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{
408ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	int i;
409ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__int32 sum;
410ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
411ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
412ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order > 0);
413ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order <= 32);
414ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
415ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	if(order <= 12) {
416ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		if(order > 8) {
417ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 10) {
418ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 12) {
419ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
420ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
421ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
422ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
423ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
424ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
425ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
426ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
427ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
428ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
429ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
430ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_set1_epi32(qlp_coeff[10]);
431ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q11 = _mm256_set1_epi32(qlp_coeff[11]);
432ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
433ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
434ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
435ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
436ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
437ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
438ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
439ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
440ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
441ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
442ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
443ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
444ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
445ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
446ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
447ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
448ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
449ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
450ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
451ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 11 */
452ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
453ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
454ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
455ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
456ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
457ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
458ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
459ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
460ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
461ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
462ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
463ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_set1_epi32(qlp_coeff[10]);
464ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
465ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
466ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
467ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
468ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
469ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
470ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
471ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
472ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
473ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
474ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
475ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
476ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
477ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
478ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
479ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
480ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
481ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
482ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
483ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
484ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 10) {
485ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
486ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
487ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
488ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
489ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
490ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
491ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
492ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
493ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
494ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
495ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_set1_epi32(qlp_coeff[9 ]);
496ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
497ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
498ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
499ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q9,  _mm256_loadu_si256((const __m256i*)(data+i-10)));
500ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));  summ = _mm256_add_epi32(summ, mull);
501ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
502ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
503ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
504ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
505ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
506ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
507ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
508ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
509ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
510ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
511ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
512ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
513ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 9 */
514ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
515ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
516ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
517ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
518ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
519ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
520ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
521ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
522ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
523ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_set1_epi32(qlp_coeff[8 ]);
524ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
525ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
526ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
527ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q8,  _mm256_loadu_si256((const __m256i*)(data+i-9)));
528ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));  summ = _mm256_add_epi32(summ, mull);
529ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
530ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
531ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
532ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
533ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
534ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
535ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
536ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
537ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
538ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
539ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
540ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
541ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
542ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else if(order > 4) {
543ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 6) {
544ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 8) {
545ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
546ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
547ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
548ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
549ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
550ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
551ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
552ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
553ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_set1_epi32(qlp_coeff[7 ]);
554ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
555ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
556ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
557ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q7,  _mm256_loadu_si256((const __m256i*)(data+i-8)));
558ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));  summ = _mm256_add_epi32(summ, mull);
559ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
560ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
561ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
562ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
563ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
564ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
565ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
566ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
567ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
568ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
569ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 7 */
570ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6;
571ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
572ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
573ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
574ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
575ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
576ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
577ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_set1_epi32(qlp_coeff[6 ]);
578ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
579ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
580ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
581ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q6,  _mm256_loadu_si256((const __m256i*)(data+i-7)));
582ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));  summ = _mm256_add_epi32(summ, mull);
583ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
584ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
585ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
586ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
587ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
588ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
589ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
590ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
591ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
592ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
593ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
594ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 6) {
595ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5;
596ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
597ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
598ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
599ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
600ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
601ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_set1_epi32(qlp_coeff[5 ]);
602ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
603ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
604ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
605ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q5,  _mm256_loadu_si256((const __m256i*)(data+i-6)));
606ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));  summ = _mm256_add_epi32(summ, mull);
607ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
608ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
609ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
610ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
611ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
612ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
613ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
614ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
615ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 5 */
616ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4;
617ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
618ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
619ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
620ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
621ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_set1_epi32(qlp_coeff[4 ]);
622ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
623ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
624ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
625ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q4,  _mm256_loadu_si256((const __m256i*)(data+i-5)));
626ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));  summ = _mm256_add_epi32(summ, mull);
627ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
628ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
629ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
630ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
631ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
632ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
633ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
634ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
635ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
636ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else {
637ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 2) {
638ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 4) {
639ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3;
640ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
641ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
642ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
643ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_set1_epi32(qlp_coeff[3 ]);
644ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
645ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
646ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
647ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q3,  _mm256_loadu_si256((const __m256i*)(data+i-4)));
648ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));  summ = _mm256_add_epi32(summ, mull);
649ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
650ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
651ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
652ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
653ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
654ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
655ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 3 */
656ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2;
657ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
658ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
659ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_set1_epi32(qlp_coeff[2 ]);
660ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
661ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
662ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
663ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q2,  _mm256_loadu_si256((const __m256i*)(data+i-3)));
664ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));  summ = _mm256_add_epi32(summ, mull);
665ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
666ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
667ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
668ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
669ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
670ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
671ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
672ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 2) {
673ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1;
674ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
675ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_set1_epi32(qlp_coeff[1 ]);
676ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
677ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
678ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
679ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q1,  _mm256_loadu_si256((const __m256i*)(data+i-2)));
680ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));  summ = _mm256_add_epi32(summ, mull);
681ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
682ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
683ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
684ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
685ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 1 */
686ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0;
687ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_set1_epi32(qlp_coeff[0 ]);
688ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
689ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-7; i+=8) {
690ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ;
691ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mullo_epi32(q0,  _mm256_loadu_si256((const __m256i*)(data+i-1)));
692ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_sra_epi32(summ, cnt);
693ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
694ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
695ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
696ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
697ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
698ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(; i < (int)data_len; i++) {
699ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
700ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
701ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 12: sum += qlp_coeff[11] * data[i-12];
702ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 11: sum += qlp_coeff[10] * data[i-11];
703ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 10: sum += qlp_coeff[ 9] * data[i-10];
704ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
705ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
706ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
707ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
708ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
709ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
710ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
711ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
712ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
713ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
714ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (sum >> lp_quantization);
715ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
716ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
717ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	else { /* order > 12 */
718ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(i = 0; i < (int)data_len; i++) {
719ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
720ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
721ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 32: sum += qlp_coeff[31] * data[i-32];
722ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 31: sum += qlp_coeff[30] * data[i-31];
723ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 30: sum += qlp_coeff[29] * data[i-30];
724ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 29: sum += qlp_coeff[28] * data[i-29];
725ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 28: sum += qlp_coeff[27] * data[i-28];
726ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 27: sum += qlp_coeff[26] * data[i-27];
727ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 26: sum += qlp_coeff[25] * data[i-26];
728ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 25: sum += qlp_coeff[24] * data[i-25];
729ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 24: sum += qlp_coeff[23] * data[i-24];
730ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 23: sum += qlp_coeff[22] * data[i-23];
731ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 22: sum += qlp_coeff[21] * data[i-22];
732ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 21: sum += qlp_coeff[20] * data[i-21];
733ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 20: sum += qlp_coeff[19] * data[i-20];
734ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 19: sum += qlp_coeff[18] * data[i-19];
735ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 18: sum += qlp_coeff[17] * data[i-18];
736ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 17: sum += qlp_coeff[16] * data[i-17];
737ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 16: sum += qlp_coeff[15] * data[i-16];
738ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 15: sum += qlp_coeff[14] * data[i-15];
739ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 14: sum += qlp_coeff[13] * data[i-14];
740ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 13: sum += qlp_coeff[12] * data[i-13];
741ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[11] * data[i-12];
742ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[10] * data[i-11];
743ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 9] * data[i-10];
744ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 8] * data[i- 9];
745ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 7] * data[i- 8];
746ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 6] * data[i- 7];
747ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 5] * data[i- 6];
748ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 4] * data[i- 5];
749ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 3] * data[i- 4];
750ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 2] * data[i- 3];
751ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 1] * data[i- 2];
752ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 0] * data[i- 1];
753ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
754ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (sum >> lp_quantization);
755ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
756ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
757ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	_mm256_zeroupper();
758ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes}
759ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
/*
 * Lane-index table for _mm256_permutevar8x32_epi32(). It is loaded into the
 * `pack` vector at the top of
 * FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2() below
 * and applied to the shifted 64-bit sums.
 *
 * Indices 0,2,4,6 gather the even 32-bit lanes (the low half of each of the
 * four 64-bit sums) into the low 128 bits of the result, which is the part
 * that function keeps via _mm256_castsi256_si128(); the odd lanes 1,3,5,7
 * land in the upper 128 bits.
 *
 * NOTE(review): the table is only read in the code visible here, so it could
 * probably be declared const -- confirm that nothing later in the file
 * writes to it before changing the qualifier.
 */
760ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesstatic FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
761ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
762ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2")
763ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
764ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{
765ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	int i;
766ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__int64 sum;
767ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
768ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
769ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
770ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order > 0);
771ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(order <= 32);
772ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
773ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
774ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	if(order <= 12) {
775ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		if(order > 8) {
776ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 10) {
777ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 12) {
778ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
779ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
780ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
781ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
782ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
783ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
784ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
785ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
786ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
787ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
788ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
789ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
790ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
791ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
792ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
793ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
794ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
795ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
796ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
797ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
798ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
799ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
800ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
801ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
802ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
803ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
804ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
805ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
806ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
807ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
808ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
809ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
810ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 11 */
811ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
812ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
813ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
814ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
815ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
816ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
817ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
818ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
819ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
820ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
821ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
822ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
823ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
824ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
825ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
826ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
827ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
828ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
829ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
830ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
831ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
832ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
833ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
834ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
835ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
836ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
837ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
838ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
839ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
840ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
841ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
842ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
843ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 10) {
844ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
845ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
846ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
847ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
848ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
849ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
850ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
851ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
852ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
853ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
854ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q9  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
855ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
856ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
857ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
858ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q9,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
859ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
860ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
861ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
862ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
863ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
864ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
865ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
866ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
867ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
868ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
869ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
870ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
871ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
872ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 9 */
873ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
874ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
875ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
876ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
877ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
878ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
879ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
880ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
881ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
882ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q8  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
883ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
884ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
885ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
886ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q8,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
887ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
888ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
889ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
890ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
891ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
892ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
893ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
894ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
895ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
896ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
897ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
898ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
899ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
900ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
901ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else if(order > 4) {
902ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 6) {
903ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 8) {
904ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
905ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
906ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
907ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
908ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
909ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
910ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
911ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
912ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q7  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
913ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
914ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
915ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
916ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q7,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
917ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
918ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
919ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
920ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
921ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
922ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
923ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
924ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
925ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
926ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
927ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
928ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 7 */
929ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5, q6;
930ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
931ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
932ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
933ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
934ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
935ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
936ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q6  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
937ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
938ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
939ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
940ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q6,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
941ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
942ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
943ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
944ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
945ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
946ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
947ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
948ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
949ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
950ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
951ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
952ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
953ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 6) {
954ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4, q5;
955ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
956ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
957ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
958ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
959ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
960ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q5  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
961ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
962ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
963ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
964ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q5,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
965ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
966ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
967ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
968ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
969ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
970ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
971ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
972ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
973ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
974ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 5 */
975ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3, q4;
976ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
977ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
978ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
979ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
980ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q4  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
981ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
982ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
983ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
984ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q4,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
985ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
986ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
987ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
988ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
989ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
990ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
991ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
992ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
993ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
994ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
995ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		else {
996ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			if(order > 2) {
997ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 4) {
998ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2, q3;
999ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1000ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1001ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1002ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q3  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
1003ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
1004ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
1005ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
1006ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q3,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
1007ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
1008ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1009ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1010ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1011ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1012ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
1013ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
1014ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 3 */
1015ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1, q2;
1016ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1017ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1018ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q2  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
1019ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
1020ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
1021ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
1022ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q2,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
1023ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
1024ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1025ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1026ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1027ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
1028ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
1029ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
1030ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			else {
1031ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				if(order == 2) {
1032ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0, q1;
1033ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1034ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q1  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
1035ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
1036ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
1037ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ, mull;
1038ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q1,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
1039ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						mull = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
1040ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1041ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1042ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
1043ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
1044ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				else { /* order == 1 */
1045ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					__m256i q0;
1046ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					q0  = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
1047ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
1048ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					for(i = 0; i < (int)data_len-3; i+=4) {
1049ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						__m256i summ;
1050ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_mul_epi32(q0,  _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
1051ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
1052ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
1053ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes					}
1054ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				}
1055ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
1056ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
1057ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(; i < (int)data_len; i++) {
1058ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
1059ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
1060ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1061ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1062ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1063ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1064ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1065ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1066ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1067ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1068ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1069ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1070ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1071ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1072ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
1073ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1074ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
1075ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
1076ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	else { /* order > 12 */
1077ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		for(i = 0; i < (int)data_len; i++) {
1078ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			sum = 0;
1079ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			switch(order) {
1080ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
1081ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
1082ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
1083ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
1084ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
1085ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
1086ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
1087ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
1088ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
1089ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
1090ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
1091ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
1092ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
1093ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
1094ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
1095ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
1096ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
1097ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
1098ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
1099ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
1100ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
1101ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
1102ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
1103ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
1104ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
1105ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
1106ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
1107ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
1108ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
1109ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
1110ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
1111ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
1112ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			}
1113ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
1114ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes		}
1115ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	}
1116ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes	_mm256_zeroupper();
1117ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes}
1118ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes
1119ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__AVX2_SUPPORTED */
1120ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1121ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__NO_ASM */
1122ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__INTEGER_ONLY_LIBRARY */
1123