/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes */ 32ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 33ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef HAVE_CONFIG_H 34ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes# include <config.h> 35ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif 36ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 37ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/cpu.h" 38ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 39ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__INTEGER_ONLY_LIBRARY 40ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifndef FLAC__NO_ASM 41ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN 42ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "private/lpc.h" 43ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#ifdef FLAC__AVX2_SUPPORTED 44ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 45ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/assert.h" 46ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include "FLAC/format.h" 47ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 48ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#include <immintrin.h> /* AVX2 */ 49ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 50ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2") 51ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 52ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 53ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes int i; 54ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 sum; 
55ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 56ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 57ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order > 0); 58ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order <= 32); 59ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 60ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order <= 12) { 61ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { 62ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 10) { 63ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 12) { 64ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 65ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 66ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 67ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 68ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 69ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 70ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 71ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 72ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 73ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 74ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); 75ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); 
76ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]); 77ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 78ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 79ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 80ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); 81ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); 82ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 83ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 84ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 85ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 86ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 87ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 88ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 89ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ 
= _mm256_add_epi32(summ, mull); 90ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 91ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 92ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 93ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 94ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 95ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 96ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 11 */ 97ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 98ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 99ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 100ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 101ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 102ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 103ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 104ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 105ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 106ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 107ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(0xffff & 
qlp_coeff[9 ]); 108ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]); 109ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 110ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 111ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 112ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); 113ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 114ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 115ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 116ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 117ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 118ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 119ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 120ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 121ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const 
__m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 122ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 123ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 124ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 125ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 126ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 127ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 128ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 129ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 10) { 130ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 131ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 132ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 133ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 134ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 135ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 136ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 137ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 138ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 139ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 140ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]); 
141ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 142ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 143ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 144ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); 145ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull); 146ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull); 147ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 148ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 149ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 150ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 151ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 152ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 153ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 154ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ 
= _mm256_sra_epi32(summ, cnt); 155ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 156ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 157ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 158ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9 */ 159ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 160ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 161ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 162ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 163ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 164ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 165ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 166ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 167ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 168ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]); 169ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 170ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 171ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 172ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); 173ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, 
mull); 174ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 175ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 176ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 177ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 178ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 179ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 180ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 181ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 182ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 183ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 184ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 185ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 186ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 187ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else if(order > 4) { 188ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 6) { 189ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 8) { 
190ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7; 191ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 192ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 193ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 194ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 195ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 196ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 197ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 198ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]); 199ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 200ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 201ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 202ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); 203ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull); 204ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 205ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 206ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = 
_mm256_add_epi32(summ, mull); 207ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 208ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 209ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 210ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 211ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 212ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 213ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 214ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 7 */ 215ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6; 216ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 217ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 218ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 219ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 220ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 221ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 222ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]); 223ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 224ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < 
(int)data_len-7; i+=8) { 225ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 226ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); 227ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull); 228ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 229ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 230ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 231ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 232ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 233ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 234ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 235ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 236ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 237ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 238ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 239ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 6) { 240ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5; 
241ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 242ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 243ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 244ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 245ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 246ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]); 247ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 248ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 249ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 250ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); 251ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull); 252ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 253ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 254ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 255ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 256ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 
257ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 258ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 259ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 260ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5 */ 261ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4; 262ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 263ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 264ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 265ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 266ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]); 267ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 268ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 269ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 270ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); 271ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull); 272ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 273ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 274ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const 
__m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 275ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 276ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 277ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 278ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 279ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 280ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 281ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 282ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 2) { 283ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 4) { 284ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3; 285ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 286ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 287ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 288ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]); 289ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 290ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 291ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 292ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); 293ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull); 294ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, 
mull); 295ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 296ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 297ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 298ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 299ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 300ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 3 */ 301ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2; 302ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 303ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 304ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]); 305ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 306ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 307ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 308ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); 309ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull); 310ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 311ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 312ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), 
_mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 313ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 314ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 315ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 316ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 317ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 2) { 318ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1; 319ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 320ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]); 321ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 322ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 323ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 324ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); 325ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull); 326ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 327ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 328ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 329ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 330ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1 */ 331ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0; 332ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]); 333ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 334ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < 
(int)data_len-7; i+=8) { 335ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ; 336ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); 337ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 338ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 339ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 340ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 341ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 342ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 343ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(; i < (int)data_len; i++) { 344ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 345ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 346ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 12: sum += qlp_coeff[11] * data[i-12]; 347ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 11: sum += qlp_coeff[10] * data[i-11]; 348ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 10: sum += qlp_coeff[ 9] * data[i-10]; 349ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 9: sum += qlp_coeff[ 8] * data[i- 9]; 350ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 8: sum += qlp_coeff[ 7] * data[i- 8]; 351ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 7: sum += qlp_coeff[ 6] * data[i- 7]; 352ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 6: sum += qlp_coeff[ 5] * data[i- 6]; 353ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 5: sum += qlp_coeff[ 4] * data[i- 5]; 354ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 4: sum += qlp_coeff[ 3] * data[i- 4]; 355ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 3: sum += qlp_coeff[ 2] * data[i- 3]; 
356ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 2: sum += qlp_coeff[ 1] * data[i- 2]; 357ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 1: sum += qlp_coeff[ 0] * data[i- 1]; 358ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 359ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 360ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 361ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 362ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order > 12 */ 363ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 364ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 365ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 366ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 32: sum += qlp_coeff[31] * data[i-32]; 367ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 31: sum += qlp_coeff[30] * data[i-31]; 368ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 30: sum += qlp_coeff[29] * data[i-30]; 369ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 29: sum += qlp_coeff[28] * data[i-29]; 370ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 28: sum += qlp_coeff[27] * data[i-28]; 371ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 27: sum += qlp_coeff[26] * data[i-27]; 372ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 26: sum += qlp_coeff[25] * data[i-26]; 373ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 25: sum += qlp_coeff[24] * data[i-25]; 374ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 24: sum += qlp_coeff[23] * data[i-24]; 375ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 23: sum += qlp_coeff[22] * data[i-23]; 376ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 22: sum += qlp_coeff[21] * data[i-22]; 377ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 21: 
sum += qlp_coeff[20] * data[i-21]; 378ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 20: sum += qlp_coeff[19] * data[i-20]; 379ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 19: sum += qlp_coeff[18] * data[i-19]; 380ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 18: sum += qlp_coeff[17] * data[i-18]; 381ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 17: sum += qlp_coeff[16] * data[i-17]; 382ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 16: sum += qlp_coeff[15] * data[i-16]; 383ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 15: sum += qlp_coeff[14] * data[i-15]; 384ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 14: sum += qlp_coeff[13] * data[i-14]; 385ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 13: sum += qlp_coeff[12] * data[i-13]; 386ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[11] * data[i-12]; 387ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[10] * data[i-11]; 388ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 9] * data[i-10]; 389ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 8] * data[i- 9]; 390ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 7] * data[i- 8]; 391ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 6] * data[i- 7]; 392ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 5] * data[i- 6]; 393ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 4] * data[i- 5]; 394ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 3] * data[i- 4]; 395ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 2] * data[i- 3]; 396ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 1] * data[i- 2]; 397ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 0] * data[i- 1]; 
398ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 399ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 400ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 401ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 402ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_zeroupper(); 403ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 404ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 405ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2") 406ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 407ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 408ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes int i; 409ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int32 sum; 410ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 411ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 412ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order > 0); 413ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order <= 32); 414ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 415ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order <= 12) { 416ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { 417ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 10) { 418ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 12) { 419ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 420ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 421ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = 
_mm256_set1_epi32(qlp_coeff[1 ]); 422ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 423ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 424ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 425ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 426ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 427ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 428ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 429ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 430ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_set1_epi32(qlp_coeff[10]); 431ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q11 = _mm256_set1_epi32(qlp_coeff[11]); 432ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 433ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 434ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 435ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12))); 436ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull); 437ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 438ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 439ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q7, 
_mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 440ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 441ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 442ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 443ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 444ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 445ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 446ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 447ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 448ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 449ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 450ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 451ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 11 */ 452ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 453ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 
454ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 455ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 456ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 457ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 458ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 459ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 460ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 461ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 462ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 463ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_set1_epi32(qlp_coeff[10]); 464ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 465ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 466ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 467ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); 468ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull); 469ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 470ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 471ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const 
__m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 472ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 473ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 474ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 475ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 476ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 477ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 478ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 479ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 480ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 481ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 482ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 483ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 484ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 10) { 485ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 486ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 487ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 
488ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 489ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 490ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 491ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 492ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 493ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 494ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 495ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_set1_epi32(qlp_coeff[9 ]); 496ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 497ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 498ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 499ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); 500ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull); 501ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 502ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 503ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 504ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = 
_mm256_add_epi32(summ, mull); 505ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 506ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 507ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 508ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 509ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 510ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 511ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 512ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 513ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9 */ 514ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 515ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 516ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 517ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 518ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 519ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 520ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 521ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 
522ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 523ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_set1_epi32(qlp_coeff[8 ]); 524ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 525ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 526ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 527ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); 528ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull); 529ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 530ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 531ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 532ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 533ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 534ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 535ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 
536ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 537ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 538ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 539ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 540ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 541ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 542ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else if(order > 4) { 543ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 6) { 544ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 8) { 545ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7; 546ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 547ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 548ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 549ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 550ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 551ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 552ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 553ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_set1_epi32(qlp_coeff[7 ]); 554ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 555ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 556ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 557ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); 
558ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull); 559ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 560ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 561ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 562ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 563ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 564ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 565ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 566ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 567ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 568ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 569ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 7 */ 570ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6; 571ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 572ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 
573ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 574ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 575ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 576ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 577ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_set1_epi32(qlp_coeff[6 ]); 578ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 579ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 580ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 581ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); 582ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull); 583ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 584ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 585ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 586ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 587ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 588ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 
589ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 590ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 591ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 592ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 593ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 594ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 6) { 595ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5; 596ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 597ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 598ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 599ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 600ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 601ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_set1_epi32(qlp_coeff[5 ]); 602ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 603ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 604ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 605ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); 606ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull); 607ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 608ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const 
__m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 609ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 610ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 611ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 612ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 613ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 614ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 615ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5 */ 616ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4; 617ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 618ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 619ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 620ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 621ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_set1_epi32(qlp_coeff[4 ]); 622ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 623ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 624ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 625ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); 626ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull); 
627ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 628ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 629ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 630ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 631ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 632ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 633ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 634ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 635ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 636ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 637ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 2) { 638ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 4) { 639ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3; 640ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 641ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 642ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 643ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_set1_epi32(qlp_coeff[3 ]); 644ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 645ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 646ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 647ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); 648ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull); 649ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 650ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 651ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 652ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 653ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 654ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 655ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 3 */ 656ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2; 657ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 658ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 659ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_set1_epi32(qlp_coeff[2 ]); 660ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 661ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 662ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 663ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); 664ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull); 
665ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 666ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 667ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 668ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 669ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 670ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 671ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 672ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 2) { 673ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1; 674ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 675ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_set1_epi32(qlp_coeff[1 ]); 676ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 677ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 678ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 679ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); 680ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull); 681ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 682ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 683ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 684ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 
685ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1 */ 686ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0; 687ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_set1_epi32(qlp_coeff[0 ]); 688ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 689ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-7; i+=8) { 690ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ; 691ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); 692ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_sra_epi32(summ, cnt); 693ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)); 694ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 695ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 696ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 697ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 698ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(; i < (int)data_len; i++) { 699ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 700ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 701ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 12: sum += qlp_coeff[11] * data[i-12]; 702ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 11: sum += qlp_coeff[10] * data[i-11]; 703ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 10: sum += qlp_coeff[ 9] * data[i-10]; 704ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 9: sum += qlp_coeff[ 8] * data[i- 9]; 705ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 8: sum += qlp_coeff[ 7] * data[i- 8]; 706ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 7: sum += qlp_coeff[ 6] * data[i- 7]; 
707ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 6: sum += qlp_coeff[ 5] * data[i- 6]; 708ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 5: sum += qlp_coeff[ 4] * data[i- 5]; 709ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 4: sum += qlp_coeff[ 3] * data[i- 4]; 710ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 3: sum += qlp_coeff[ 2] * data[i- 3]; 711ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 2: sum += qlp_coeff[ 1] * data[i- 2]; 712ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 1: sum += qlp_coeff[ 0] * data[i- 1]; 713ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 714ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 715ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 716ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 717ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order > 12 */ 718ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 719ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 720ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 721ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 32: sum += qlp_coeff[31] * data[i-32]; 722ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 31: sum += qlp_coeff[30] * data[i-31]; 723ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 30: sum += qlp_coeff[29] * data[i-30]; 724ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 29: sum += qlp_coeff[28] * data[i-29]; 725ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 28: sum += qlp_coeff[27] * data[i-28]; 726ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 27: sum += qlp_coeff[26] * data[i-27]; 727ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 26: sum += qlp_coeff[25] * data[i-26]; 728ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 25: sum += 
qlp_coeff[24] * data[i-25]; 729ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 24: sum += qlp_coeff[23] * data[i-24]; 730ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 23: sum += qlp_coeff[22] * data[i-23]; 731ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 22: sum += qlp_coeff[21] * data[i-22]; 732ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 21: sum += qlp_coeff[20] * data[i-21]; 733ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 20: sum += qlp_coeff[19] * data[i-20]; 734ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 19: sum += qlp_coeff[18] * data[i-19]; 735ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 18: sum += qlp_coeff[17] * data[i-18]; 736ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 17: sum += qlp_coeff[16] * data[i-17]; 737ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 16: sum += qlp_coeff[15] * data[i-16]; 738ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 15: sum += qlp_coeff[14] * data[i-15]; 739ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 14: sum += qlp_coeff[13] * data[i-14]; 740ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 13: sum += qlp_coeff[12] * data[i-13]; 741ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[11] * data[i-12]; 742ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[10] * data[i-11]; 743ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 9] * data[i-10]; 744ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 8] * data[i- 9]; 745ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 7] * data[i- 8]; 746ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 6] * data[i- 7]; 747ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 5] * data[i- 6]; 748ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 4] * data[i- 5]; 
749ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 3] * data[i- 4]; 750ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 2] * data[i- 3]; 751ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 1] * data[i- 2]; 752ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 0] * data[i- 1]; 753ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 754ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (sum >> lp_quantization); 755ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 756ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 757ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_zeroupper(); 758ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 759ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 760ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesstatic FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 }; 761ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 762ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott HughesFLAC__SSE_TARGET("avx2") 763ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughesvoid FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) 764ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes{ 765ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes int i; 766ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__int64 sum; 767ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m128i cnt = _mm_cvtsi32_si128(lp_quantization); 768ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr); 769ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 770ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(order > 0); 771ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes FLAC__ASSERT(order <= 32); 772ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */ 773ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 774ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order <= 12) { 775ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 8) { 776ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 10) { 777ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 12) { 778ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; 779ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 780ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 781ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 782ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 783ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 784ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 785ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 786ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 787ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 788ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 789ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); 
790ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11])); 791ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 792ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 793ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 794ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); 795ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull); 796ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); 797ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 798ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 799ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 800ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 801ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 802ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const 
__m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 803ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 804ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 805ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 806ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 807ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 808ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 809ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 810ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 11 */ 811ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; 812ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 813ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 814ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 815ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 816ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 817ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 
818ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 819ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 820ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 821ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 822ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); 823ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 824ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 825ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 826ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); 827ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); 828ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 829ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 830ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 831ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 832ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 
mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 833ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 834ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 835ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 836ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 837ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 838ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 839ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 840ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 841ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 842ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 843ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 10) { 844ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; 845ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 846ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 847ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 
])); 848ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 849ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 850ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 851ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 852ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 853ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 854ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); 855ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 856ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 857ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 858ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); 859ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); 860ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 861ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 862ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 
863ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 864ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 865ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 866ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 867ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 868ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 869ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 870ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 871ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 872ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 9 */ 873ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; 874ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 875ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 876ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 877ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = 
_mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 878ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 879ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 880ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 881ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 882ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); 883ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 884ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 885ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 886ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); 887ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); 888ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 889ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 890ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 891ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 
892ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 893ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 894ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 895ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 896ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 897ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 898ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 899ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 900ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 901ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else if(order > 4) { 902ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 6) { 903ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 8) { 904ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6, q7; 905ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 906ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 907ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 908ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 909ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = 
_mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 910ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 911ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 912ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); 913ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 914ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 915ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 916ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); 917ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); 918ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 919ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 920ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 921ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 922ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 
923ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 924ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 925ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 926ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 927ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 928ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 7 */ 929ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5, q6; 930ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 931ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 932ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 933ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 934ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 935ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 936ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); 937ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 938ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 939ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 940ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const 
__m128i*)(data+i-7 )))); 941ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); 942ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 943ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 944ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 945ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 946ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 947ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 948ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 949ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 950ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 951ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 952ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 953ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 6) { 954ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4, q5; 955ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = 
_mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 956ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 957ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 958ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 959ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 960ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); 961ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 962ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 963ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 964ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); 965ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); 966ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 967ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 968ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 969ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 
970ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 971ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 972ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 973ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 974ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 5 */ 975ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3, q4; 976ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 977ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 978ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 979ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 980ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); 981ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 982ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 983ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 984ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); 985ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); 986ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 
987ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 988ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 989ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 990ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 991ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 992ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 993ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 994ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 995ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 996ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order > 2) { 997ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 4) { 998ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2, q3; 999ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 1000ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 1001ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 1002ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); 1003ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1004ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 1005ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 1006ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott 
Hughes summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); 1007ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); 1008ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 1009ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 1010ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 1011ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 1012ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1013ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1014ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 3 */ 1015ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1, q2; 1016ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 1017ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 1018ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); 1019ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1020ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 1021ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 1022ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q2, 
_mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); 1023ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); 1024ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); 1025ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 1026ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 1027ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1028ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1029ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1030ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { 1031ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes if(order == 2) { 1032ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0, q1; 1033ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 1034ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); 1035ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1036ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 1037ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ, mull; 1038ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); 1039ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, 
mull); 1040ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 1041ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 1042ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1043ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1044ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order == 1 */ 1045ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i q0; 1046ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); 1047ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1048ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len-3; i+=4) { 1049ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes __m256i summ; 1050ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); 1051ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); 1052ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); 1053ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1054ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1055ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1056ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1057ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(; i < (int)data_len; i++) { 1058ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 1059ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 1060ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 12: sum += qlp_coeff[11] * 
(FLAC__int64)data[i-12]; 1061ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; 1062ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; 1063ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 9: sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; 1064ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 8: sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; 1065ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 7: sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; 1066ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 6: sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; 1067ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 5: sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; 1068ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 4: sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; 1069ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 3: sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; 1070ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 2: sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; 1071ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 1: sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; 1072ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1073ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); 1074ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1075ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1076ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes else { /* order > 12 */ 1077ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes for(i = 0; i < (int)data_len; i++) { 1078ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum = 0; 1079ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes switch(order) { 1080ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 32: sum += 
qlp_coeff[31] * (FLAC__int64)data[i-32]; 1081ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31]; 1082ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30]; 1083ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29]; 1084ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28]; 1085ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27]; 1086ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26]; 1087ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25]; 1088ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24]; 1089ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23]; 1090ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22]; 1091ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21]; 1092ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20]; 1093ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19]; 1094ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18]; 1095ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17]; 1096ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16]; 1097ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 15: sum += qlp_coeff[14] * 
(FLAC__int64)data[i-15]; 1098ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14]; 1099ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13]; 1100ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[11] * (FLAC__int64)data[i-12]; 1101ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[10] * (FLAC__int64)data[i-11]; 1102ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10]; 1103ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9]; 1104ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8]; 1105ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7]; 1106ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6]; 1107ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5]; 1108ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4]; 1109ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3]; 1110ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2]; 1111ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1]; 1112ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1113ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization); 1114ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1115ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes } 1116ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes _mm256_zeroupper(); 1117ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes} 
1118ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes 1119ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__AVX2_SUPPORTED */ 1120ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */ 1121ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__NO_ASM */ 1122ae0e7bcc925f0624f6e34976984b40181c965fd9Elliott Hughes#endif /* FLAC__INTEGER_ONLY_LIBRARY */ 1123