176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org/* 276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * 476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * Use of this source code is governed by a BSD-style license 576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * that can be found in the LICENSE file in the root of the source 676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * tree. An additional intellectual property rights grant can be found 776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * in the file PATENTS. All contributing project authors may 876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org * be found in the AUTHORS file in the root of the source tree. 976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org */ 1076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 1176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org#include <immintrin.h> 1276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org#include "vpx_ports/mem.h" 1376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 1476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org// filters for 16_h8 and 16_v8 15411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 1676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 17411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 18411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 1976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 20411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { 2176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 22411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 23411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 2476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 25411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { 2676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 27411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 28411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 2976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 30411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 3176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 32411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 33411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org}; 3476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 3593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org#if defined(__clang__) 36693441efe611de7ca09c00f4e79776f604b689f4joeyparrish@google.com# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \ 37693441efe611de7ca09c00f4e79776f604b689f4joeyparrish@google.com (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0) 3893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) \ 3993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org _mm_broadcastsi128_si256((__m128i const *)&(x)) 40693441efe611de7ca09c00f4e79776f604b689f4joeyparrish@google.com# else // clang > 3.3, and not 5.0 on macosx. 4193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 4293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# endif // clang <= 3.3 4393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org#elif defined(__GNUC__) 4493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) 4593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) \ 4693a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org _mm_broadcastsi128_si256((__m128i const *)&(x)) 4793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 4893a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) 4993a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# else // gcc > 4.7 5093a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 5193a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# endif // gcc <= 4.6 5293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org#else // !(gcc || clang) 5393a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) 5493a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org#endif // __clang__ 5593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org 5676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, 5776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int src_pixels_per_line, 5876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned char *output_ptr, 5976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int output_pitch, 6076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int output_height, 6176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org int16_t *filter) { 6276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i filtersReg; 6376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; 6476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 6576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; 6676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i srcReg32b1, srcReg32b2, filtersReg32; 6776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int i; 6876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int src_stride, dst_stride; 6976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 7076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 7176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); 7276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 7376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the same data 7476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // in both lanes of 128 bit register. 7576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 7676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // have the same data in both lanes of a 256 bit register 7793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 7876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 7976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the first 16 bits (first and second byte) 8076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 8176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org firstFilters = _mm256_shuffle_epi8(filtersReg32, 8276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x100u)); 8376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the second 16 bits (third and forth byte) 8476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 8576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org secondFilters = _mm256_shuffle_epi8(filtersReg32, 8676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x302u)); 8776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the third 16 bits (fifth and sixth byte) 8876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 8976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org thirdFilters = _mm256_shuffle_epi8(filtersReg32, 9076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x504u)); 9176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the forth 16 bits (seventh and eighth byte) 9276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 9376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org forthFilters = _mm256_shuffle_epi8(filtersReg32, 9476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x706u)); 9576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 9676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); 9776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); 9876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); 9976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); 10076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 10176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiple the size of the source and destination stride by two 10276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org src_stride = src_pixels_per_line << 1; 10376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org dst_stride = output_pitch << 1; 10476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org for (i = output_height; i > 1; i-=2) { 10576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // load the 2 strides of source 10676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_castsi128_si256( 10776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr-3))); 10876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, 10976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *) 11076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org (src_ptr+src_pixels_per_line-3)), 1); 11176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 11276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 11376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); 11488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); 11576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 11676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 11776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); 11888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); 11976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 12076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 12176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); 12276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 12376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 12488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); 12576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); 12676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 12776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 12888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 12976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 13076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 13176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 13276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, 13376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); 13476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 13576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // reading 2 strides of the next 16 bytes 13676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // (part of it was being read by earlier read) 13776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = _mm256_castsi128_si256( 13876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+5))); 13976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, 14076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *) 14176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org (src_ptr+src_pixels_per_line+5)), 1); 14276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 14376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 14476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, 14576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); 14676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 14776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 14876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); 14988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); 15076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 15176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 15276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); 15388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); 15476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 15576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 15676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); 15776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 15876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 15988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); 16076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); 16176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 16276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 16388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); 16476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); 16576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 16676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 16776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, 16876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); 16976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, 17076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); 17176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 17276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 17376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); 17476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 17576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); 17676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 17776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shift by 7 bit each 16 bit 17876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); 17976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); 18076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 18176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 18276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // convolve result and the second lane contain the second convolve 18376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // result 18476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, 18576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt32b2_1); 18676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 18776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org src_ptr+=src_stride; 18876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 18976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 16 bytes 19076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)output_ptr, 19176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcRegFilt32b1_1)); 19276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 19376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save the next 16 bits 19476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)(output_ptr+output_pitch), 19576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); 19676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org output_ptr+=dst_stride; 19776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org } 19876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 19976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // if the number of strides is odd. 20076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // process only 16 bytes 20176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org if (i > 0) { 20276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; 20376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i srcRegFilt2, srcRegFilt3; 20476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 20576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); 20676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 20776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 20876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, 20976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(filt1Reg)); 21076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_shuffle_epi8(srcReg1, 21188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(filt4Reg)); 21276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 21376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 21476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, 21576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(firstFilters)); 21676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 21788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(forthFilters)); 21876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 21976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 22076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); 22176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 22276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 22376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3= _mm_shuffle_epi8(srcReg1, 22488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(filt2Reg)); 22576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2= _mm_shuffle_epi8(srcReg1, 22676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(filt3Reg)); 22776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 22876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 22976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, 23088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(secondFilters)); 23176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 23276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(thirdFilters)); 23376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 23476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 23576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 23676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 23776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 23876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // reading the next 16 bytes 23976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // (part of it was being read by earlier read) 24076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); 24176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 24276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 24376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 24476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 24576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 24676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 24776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, 24876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(filt1Reg)); 24976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 25088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(filt4Reg)); 25176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 25276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 25376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, 25476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(firstFilters)); 25576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 25688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(forthFilters)); 25776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 25876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 25976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); 26076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 26176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // filter the source buffer 26276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_shuffle_epi8(srcReg2, 26388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(filt2Reg)); 26476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_shuffle_epi8(srcReg2, 26576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(filt3Reg)); 26676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 26776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 26876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, 26988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org _mm256_castsi256_si128(secondFilters)); 27076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, 27176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(thirdFilters)); 27276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 27376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 27476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 27576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_min_epi16(srcRegFilt3, srcRegFilt2)); 27676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 27776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_max_epi16(srcRegFilt3, srcRegFilt2)); 27876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 27976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 28076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, 28176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(addFilterReg64)); 28276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 28376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, 28476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(addFilterReg64)); 28576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 28676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shift by 7 bit each 16 bit 28776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); 28876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); 28976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 29076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 29176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // convolve result and the second lane contain the second convolve 29276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // result 29376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); 29476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 29576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 16 bytes 29676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); 29776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org } 29876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org} 29976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 30076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr, 30176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int src_pitch, 30276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned char *output_ptr, 30376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int out_pitch, 30476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int output_height, 30576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org int16_t *filter) { 30676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i filtersReg; 30776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i addFilterReg64; 30876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; 30976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; 310ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org __m256i srcReg32b11, srcReg32b12, filtersReg32; 31176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m256i firstFilters, secondFilters, thirdFilters, forthFilters; 31276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int i; 31376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org unsigned int src_stride, dst_stride; 31476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 31576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 31676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); 31776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filtersReg = _mm_loadu_si128((__m128i *)filter); 31876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // converting the 16 bit (short) to 8 bit (byte) and have the 31976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // same data in both lanes of 128 bit register. 32076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org filtersReg =_mm_packs_epi16(filtersReg, filtersReg); 32176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // have the same data in both lanes of a 256 bit register 32293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); 32376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 32476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the first 16 bits (first and second byte) 32576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 32676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org firstFilters = _mm256_shuffle_epi8(filtersReg32, 32776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x100u)); 32876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the second 16 bits (third and forth byte) 32976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 33076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org secondFilters = _mm256_shuffle_epi8(filtersReg32, 33176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x302u)); 33276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the third 16 bits (fifth and sixth byte) 33376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 33476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org thirdFilters = _mm256_shuffle_epi8(filtersReg32, 33576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x504u)); 33676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // duplicate only the forth 16 bits (seventh and eighth byte) 33776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // across 256 bit register 33876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org forthFilters = _mm256_shuffle_epi8(filtersReg32, 33976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_set1_epi16(0x706u)); 34076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 34176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiple the size of the source and destination stride by two 34276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org src_stride = src_pitch << 1; 34376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org dst_stride = out_pitch << 1; 34476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 34576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // load 16 bytes 7 times in stride of src_pitch 34676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_castsi128_si256( 34776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr))); 34876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = _mm256_castsi128_si256( 34976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch))); 35076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b3 = _mm256_castsi128_si256( 35176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2))); 35276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b4 = _mm256_castsi128_si256( 35376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3))); 35476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b5 = _mm256_castsi128_si256( 35576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4))); 35676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b6 = _mm256_castsi128_si256( 35776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5))); 35876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b7 = _mm256_castsi128_si256( 35976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6))); 36076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 36176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // have each consecutive loads on the same 256 register 36276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, 36376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b2), 1); 36476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, 36576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b3), 1); 36676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, 36776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b4), 1); 36876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, 36976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b5), 1); 37076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, 37176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b6), 1); 37276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, 37376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b7), 1); 37476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 37576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // merge every two consecutive registers except the last one 37676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); 37776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); 37876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 37976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 38076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); 38176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 38276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 38376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); 38476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 38576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 38676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); 38776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 38876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 38976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); 39076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 39176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 39276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org for (i = output_height; i > 1; i-=2) { 39376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // load the last 2 loads of 16 bytes and have every two 39476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // consecutive loads in the same 256 bit register 39576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b8 = _mm256_castsi128_si256( 39676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7))); 39776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, 39876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b8), 1); 39976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b9 = _mm256_castsi128_si256( 40076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8))); 40176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, 40276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b9), 1); 40376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 40476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // merge every two consecutive registers 40576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 40676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); 40776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); 40876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 40976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 41076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); 41176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); 41276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 41376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 41476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); 41576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 41676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 41776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); 41876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); 41976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 42076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 42176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_adds_epi16(srcReg32b10, 42276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_min_epi16(srcReg32b8, srcReg32b12)); 42376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_adds_epi16(srcReg32b10, 42476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_max_epi16(srcReg32b8, srcReg32b12)); 42576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 426ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 427ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); 428ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); 429ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org 430ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); 431ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org 432ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 433ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); 434ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); 435ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org 436ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org // add and saturate the results together 437ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b1 = _mm256_adds_epi16(srcReg32b1, 438ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org _mm256_min_epi16(srcReg32b8, srcReg32b12)); 439ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org srcReg32b1 = _mm256_adds_epi16(srcReg32b1, 440ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org _mm256_max_epi16(srcReg32b8, srcReg32b12)); 44176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 44276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); 44376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); 44476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 44576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shift by 7 bit each 16 bit 44676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); 44776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); 44876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 44976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 45076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // convolve result and the second lane contain the second convolve 45176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // result 45276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); 45376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 45476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org src_ptr+=src_stride; 45576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 45676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 16 bytes 45776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)output_ptr, 45876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b1)); 45976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 46076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save the next 16 bits 46176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)(output_ptr+out_pitch), 46276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_extractf128_si256(srcReg32b1, 1)); 46376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 46476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org output_ptr+=dst_stride; 46576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 46676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save part of the registers for next strides 46776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b10 = srcReg32b11; 46876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b1 = srcReg32b3; 46976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b11 = srcReg32b2; 47076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b3 = srcReg32b5; 47176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b2 = srcReg32b4; 47276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b5 = srcReg32b7; 47376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcReg32b7 = srcReg32b9; 47476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org } 47576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org if (i > 0) { 47676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; 47776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; 47876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // load the last 16 bytes 47976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); 48076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 48176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // merge the last 2 results together 48276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt4 = _mm_unpacklo_epi8( 48376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); 48476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt7 = _mm_unpackhi_epi8( 48576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(srcReg32b7), srcRegFilt8); 48676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 48776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 48876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), 48976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(firstFilters)); 49076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, 49176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(forthFilters)); 49276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), 49376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(firstFilters)); 49476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, 49576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(forthFilters)); 49676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 49776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 49876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); 49976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); 50076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 50176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 50276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 50376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), 50476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(secondFilters)); 50576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), 50676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(secondFilters)); 50776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 50876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // multiply 2 adjacent elements with the filter and add the result 50976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), 51076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(thirdFilters)); 51176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), 51276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(thirdFilters)); 51376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 51476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 51576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 51676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_min_epi16(srcRegFilt4, srcRegFilt6)); 51776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, 51876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_min_epi16(srcRegFilt5, srcRegFilt7)); 51976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 52076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // add and saturate the results together 52176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 52276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_max_epi16(srcRegFilt4, srcRegFilt6)); 52376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, 52476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_max_epi16(srcRegFilt5, srcRegFilt7)); 52576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 52676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 52776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, 52876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(addFilterReg64)); 52976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, 53076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm256_castsi256_si128(addFilterReg64)); 53176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 53276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shift by 7 bit each 16 bit 53376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); 53476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); 53576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 53676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // shrink to 8 bit each 16 bits, the first lane contain the first 53776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // convolve result and the second lane contain the second convolve 53876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // result 53976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); 54076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org 54176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org // save 16 bytes 54276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); 54376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org } 54476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org} 545