/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
1076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
#include <immintrin.h>
#include "vpx_ports/mem.h"
1376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
// byte-shuffle control tables for the 16_h8 and 16_v8 filters
15411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
1676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
17411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
18411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
1976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
20411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
2176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
22411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
23411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
2476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
25411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
2676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
27411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
28411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
2976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
30411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.orgDECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
3176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
32411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
33411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org};
3476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
// MM256_BROADCASTSI128_SI256(x): copy the 128-bit value x into both lanes of
// a 256-bit register.  The intrinsic is spelled differently depending on the
// compiler version:
//   - clang <= 3.3, Apple clang 5.0 and gcc <= 4.6 use the name
//     _mm_broadcastsi128_si256 and take a pointer, so x must be an lvalue;
//   - gcc 4.7 uses _mm_broadcastsi128_si256 and takes the value;
//   - everything else uses _mm256_broadcastsi128_si256 and takes the value.
#if defined(__clang__)
# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
      (defined(__APPLE__) && __clang_major__ == 5 && __clang_minor__ == 0)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# else  // clang > 3.3, and not 5.0 on macosx.
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // clang <= 3.3
#elif defined(__GNUC__)
# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#  define MM256_BROADCASTSI128_SI256(x) \
       _mm_broadcastsi128_si256((__m128i const *)&(x))
# elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#  define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
# else  // gcc > 4.7
#  define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
# endif  // gcc <= 4.6
#else  // !(gcc || clang)
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
5593a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org
5676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
5776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int src_pixels_per_line,
5876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned char *output_ptr,
5976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int  output_pitch,
6076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int  output_height,
6176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  int16_t *filter) {
6276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m128i filtersReg;
6376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
6476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
6576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
6676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i srcReg32b1, srcReg32b2, filtersReg32;
6776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  unsigned int i;
6876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  unsigned int src_stride, dst_stride;
6976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
7076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
7176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
7276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filtersReg = _mm_loadu_si128((__m128i *)filter);
7376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // converting the 16 bit (short) to 8 bit (byte) and have the same data
7476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // in both lanes of 128 bit register.
7576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
7676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // have the same data in both lanes of a 256 bit register
7793a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
7876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
7976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // duplicate only the first 16 bits (first and second byte)
8076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across 256 bit register
8176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  firstFilters = _mm256_shuffle_epi8(filtersReg32,
8276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x100u));
8376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // duplicate only the second 16 bits (third and forth byte)
8476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across 256 bit register
8576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  secondFilters = _mm256_shuffle_epi8(filtersReg32,
8676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_set1_epi16(0x302u));
8776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // duplicate only the third 16 bits (fifth and sixth byte)
8876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across 256 bit register
8976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
9076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x504u));
9176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // duplicate only the forth 16 bits (seventh and eighth byte)
9276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across 256 bit register
9376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  forthFilters = _mm256_shuffle_epi8(filtersReg32,
9476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x706u));
9576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
9676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
9776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
9876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
9976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
10076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
10176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // multiple the size of the source and destination stride by two
10276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  src_stride = src_pixels_per_line << 1;
10376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  dst_stride = output_pitch << 1;
10476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  for (i = output_height; i > 1; i-=2) {
10576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // load the 2 strides of source
10676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg32b1 = _mm256_castsi128_si256(
10776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm_loadu_si128((__m128i *)(src_ptr-3)));
10876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
10976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm_loadu_si128((__m128i *)
11076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 (src_ptr+src_pixels_per_line-3)), 1);
11176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
11276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
11376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
11488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
11576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
11676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
11776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
11888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
11976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
12076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
12176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
12276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
12376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
12488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
12576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
12676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
12776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
12888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
12976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
13076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
13176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
13276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
13376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
13476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
13576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // reading 2 strides of the next 16 bytes
13676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // (part of it was being read by earlier read)
13776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg32b2 = _mm256_castsi128_si256(
13876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm_loadu_si128((__m128i *)(src_ptr+5)));
13976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
14076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm_loadu_si128((__m128i *)
14176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 (src_ptr+src_pixels_per_line+5)), 1);
14276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
14376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
14476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
14576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
14676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
14776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
14876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
14988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
15076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
15176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
15276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
15388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
15476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
15576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
15676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
15776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
15876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
15988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
16076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
16176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
16276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
16388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
16476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
16576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
16676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
16776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
16876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
16976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
17076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
17176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
17276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
17376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
17476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
17576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
17676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
17776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // shift by 7 bit each 16 bit
17876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
17976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
18076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
18176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // shrink to 8 bit each 16 bits, the first lane contain the first
18276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // convolve result and the second lane contain the second convolve
18376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // result
18476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
18576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                           srcRegFilt32b2_1);
18676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
18776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    src_ptr+=src_stride;
18876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
18976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // save 16 bytes
19076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm_store_si128((__m128i*)output_ptr,
19176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm256_castsi256_si128(srcRegFilt32b1_1));
19276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
19376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // save the next 16 bits
19476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
19576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
19676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    output_ptr+=dst_stride;
19776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  }
19876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
19976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // if the number of strides is odd.
20076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // process only 16 bytes
20176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  if (i > 0) {
20276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
20376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    __m128i srcRegFilt2, srcRegFilt3;
20476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
20576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
20676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
20776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
20876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
20976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(filt1Reg));
21076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
21188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(filt4Reg));
21276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
21376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
21476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
21576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(firstFilters));
21676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
21788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(forthFilters));
21876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
21976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
22076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
22176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
22276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
22376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3= _mm_shuffle_epi8(srcReg1,
22488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                 _mm256_castsi256_si128(filt2Reg));
22576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2= _mm_shuffle_epi8(srcReg1,
22676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_castsi256_si128(filt3Reg));
22776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
22876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
22976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
23088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(secondFilters));
23176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
23276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(thirdFilters));
23376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
23476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
23576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
23676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
23776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
23876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // reading the next 16 bytes
23976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // (part of it was being read by earlier read)
24076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
24176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
24276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
24376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
24476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
24576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
24676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
24776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
24876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(filt1Reg));
24976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
25088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(filt4Reg));
25176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
25276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
25376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
25476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(firstFilters));
25576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
25688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(forthFilters));
25776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
25876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
25976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
26076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
26176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // filter the source buffer
26276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
26388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(filt2Reg));
26476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
26576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(filt3Reg));
26676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
26776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // multiply 2 adjacent elements with the filter and add the result
26876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
26988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                  _mm256_castsi256_si128(secondFilters));
27076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
27176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(thirdFilters));
27276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
27376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
27476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
27576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
27676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
27776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));
27876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
27976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
28076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
28176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(addFilterReg64));
28276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
28376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
28476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                    _mm256_castsi256_si128(addFilterReg64));
28576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
28676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // shift by 7 bit each 16 bit
28776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
28876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
28976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
29076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // shrink to 8 bit each 16 bits, the first lane contain the first
29176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // convolve result and the second lane contain the second convolve
29276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // result
29376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
29476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
29576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // save 16 bytes
29676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
29776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  }
29876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org}
29976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
// Vertical 8-tap filter over a 16-pixel-wide column, implemented with AVX2.
//
// Parameters:
//   src_ptr       - first source row. Rows src_ptr + k * src_pitch for
//                   k = 0 .. output_height + 6 are read, 16 bytes each, with
//                   unaligned loads.
//   src_pitch     - distance in bytes between consecutive source rows.
//   output_ptr    - first destination row. Every output row is written as
//                   16 bytes with _mm_store_si128, which requires a 16-byte
//                   aligned address, so output_ptr and out_pitch must keep
//                   that alignment.
//   out_pitch     - distance in bytes between consecutive destination rows.
//   output_height - number of output rows to produce.
//   filter        - 8 int16_t taps (16 bytes are loaded); they are narrowed
//                   to 8 bits with signed saturation before use.
//
// Every output byte is (sum of the 8 pixel * tap products + 64) >> 7, where
// the sum is built with saturating 16-bit additions and the final value is
// packed to 8 bits with unsigned saturation.
//
// Implementation outline:
//  * Each 256-bit source register holds row k in its low 128-bit lane and
//    row k + 1 in its high lane, so the low lane produces one output row
//    and the high lane the next one: two output rows per loop iteration.
//  * Consecutive rows are byte-interleaved (unpacklo/unpackhi) so that
//    _mm256_maddubs_epi16 multiplies each pair of vertically adjacent
//    pixels by a pair of taps and adds the two products.
//  * The interleaved pairs are kept across iterations and shifted down by
//    one tap pair at the bottom of the loop, so only two new rows are
//    loaded per iteration.
//  * If output_height is odd, the last row is computed after the loop with
//    128-bit operations on the low lanes of the same registers.
//
// NOTE(review): the products of the two middle tap pairs are added
// smaller-first, then larger; presumably this ordering is meant to limit
// intermediate saturation - confirm.
// NOTE(review): MM256_BROADCASTSI128_SI256 is defined outside this block;
// it is assumed to copy the 128-bit value into both lanes - confirm.
30076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.orgvoid vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
30176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int src_pitch,
30276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned char *output_ptr,
30376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int out_pitch,
30476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  unsigned int output_height,
30576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                                  int16_t *filter) {
30676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m128i filtersReg;
30776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i addFilterReg64;
30876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
30976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
310ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org  __m256i srcReg32b11, srcReg32b12, filtersReg32;
31176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
31276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  unsigned int i;
31376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  unsigned int src_stride, dst_stride;
31476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
31576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // rounding term: every 16-bit element is 64 (added before the >> 7)
31676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
31776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filtersReg = _mm_loadu_si128((__m128i *)filter);
31876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // narrow the 16-bit taps to 8 bits (signed saturation); both 64-bit
31976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // halves of the 128-bit register then hold the same 8 taps.
32076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
32176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // have the same data in both 128-bit lanes of a 256-bit register
32293a74791c8e808ea76001ee07693aa2a5fdd3500johannkoenig@chromium.org  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
32376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
32476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // replicate taps 0 and 1 (first and second byte)
32576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across the 256-bit register
32676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  firstFilters = _mm256_shuffle_epi8(filtersReg32,
32776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x100u));
32876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // replicate taps 2 and 3 (third and fourth byte)
32976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across the 256-bit register
33076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  secondFilters = _mm256_shuffle_epi8(filtersReg32,
33176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_set1_epi16(0x302u));
33276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // replicate taps 4 and 5 (fifth and sixth byte)
33376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across the 256-bit register
33476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
33576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x504u));
33676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // replicate taps 6 and 7 (seventh and eighth byte)
33776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // across the 256-bit register
33876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  forthFilters = _mm256_shuffle_epi8(filtersReg32,
33976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                 _mm256_set1_epi16(0x706u));
34076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
34176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // the main loop advances two rows at a time, so double both pitches
34276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  src_stride = src_pitch << 1;
34376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  dst_stride = out_pitch << 1;
34476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
34576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // load the first 7 source rows (16 bytes each, src_pitch apart)
34676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b1 = _mm256_castsi128_si256(
34776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr)));
34876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b2 = _mm256_castsi128_si256(
34976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
35076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b3 = _mm256_castsi128_si256(
35176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
35276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b4 = _mm256_castsi128_si256(
35376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
35476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b5 = _mm256_castsi128_si256(
35576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
35676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b6 = _mm256_castsi128_si256(
35776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
35876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b7 = _mm256_castsi128_si256(
35976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
36076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
36176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // put row k in the low lane and row k+1 in the high lane of register k
36276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
36376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b2), 1);
36476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
36576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b3), 1);
36676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
36776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b4), 1);
36876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
36976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b5), 1);
37076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
37176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b6), 1);
37276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
37376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org               _mm256_castsi256_si128(srcReg32b7), 1);
37476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
37576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // byte-interleave consecutive registers pairwise (srcReg32b7 is left for the loop)
37676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
37776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
37876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
37976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // pixel pairs for taps 2 and 3, columns 0-7
38076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
38176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
38276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // pixel pairs for taps 2 and 3, columns 8-15
38376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
38476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
38576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // pixel pairs for taps 4 and 5, columns 0-7
38676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
38776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
38876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  // pixel pairs for taps 4 and 5, columns 8-15
38976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
39076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
39176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
39276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  for (i = output_height; i > 1; i-=2) {
39376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // load the two new rows (7 and 8 relative to src_ptr) and pair them
39476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // up as before: row k in the low lane, row k+1 in the high lane
39576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b8 = _mm256_castsi128_si256(
39676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
39776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
39876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm256_castsi256_si128(srcReg32b8), 1);
39976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b9 = _mm256_castsi128_si256(
40076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
40176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
40276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm256_castsi256_si128(srcReg32b9), 1);
40376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
40476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // byte-interleave the two new registers:
40576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // pixel pairs for taps 6 and 7 (columns 0-7, then columns 8-15)
40676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
40776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
40876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
40976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // columns 0-7: multiply pixel pairs by taps 0/1 and taps 6/7
41076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
41176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
41276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
41376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // add and saturate the results together
41476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
41576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
41676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // columns 0-7: multiply pixel pairs by taps 2/3 and taps 4/5
41776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
41876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
41976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
42076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // saturating add of both products, the smaller one first
42176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
42276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
42376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
42476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                   _mm256_max_epi16(srcReg32b8, srcReg32b12));
42576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
426ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     // columns 8-15: multiply pixel pairs by taps 0/1 and taps 6/7
427ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
428ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
429ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org
430ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
431ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org
432ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     // columns 8-15: multiply pixel pairs by taps 2/3 and taps 4/5
433ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
434ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
435ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org
436ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     // saturating add of both products, the smaller one first
437ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
438ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
439ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
440ace65784417788374f0b19ce5a8abd06c9ccd007johannkoenig@chromium.org                  _mm256_max_epi16(srcReg32b8, srcReg32b12));
44176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
44276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
44376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
44476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
44576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // arithmetic shift right by 7 of every 16-bit element
44676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
44776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
44876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
44976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // pack to 8 bits with unsigned saturation; the low lane now holds
45076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // the first output row and the high lane the second output
45176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // row
45276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
45376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
45476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     src_ptr+=src_stride;
45576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
45676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // store the first output row (16 bytes, aligned store)
45776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm_store_si128((__m128i*)output_ptr,
45876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm256_castsi256_si128(srcReg32b1));
45976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
46076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // store the second output row (16 bytes, aligned store)
46176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
46276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     _mm256_extractf128_si256(srcReg32b1, 1));
46376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
46476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     output_ptr+=dst_stride;
46576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
46676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     // shift the interleaved row pairs down one tap pair for the next two rows
46776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b10 = srcReg32b11;
46876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b1 = srcReg32b3;
46976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b11 = srcReg32b2;
47076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b3 = srcReg32b5;
47176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b2 = srcReg32b4;
47276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b5 = srcReg32b7;
47376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org     srcReg32b7 = srcReg32b9;
47476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  }
47576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  if (i > 0) {
47676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
47776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
47876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // odd output_height: load the one remaining source row
47976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
48076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
48176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // byte-interleave the last two source rows (pairs for taps 6 and 7)
48276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt4 = _mm_unpacklo_epi8(
48376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
48476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt7 = _mm_unpackhi_epi8(
48576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
48676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
48776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // taps 0/1 and 6/7 for columns 0-7 (Filt1, Filt4) and 8-15 (Filt3, Filt7)
48876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
48976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(firstFilters));
49076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
49176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(forthFilters));
49276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
49376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(firstFilters));
49476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
49576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(forthFilters));
49676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
49776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // add and saturate the results together
49876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
49976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
50076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
50176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
50276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // taps 2/3 for columns 0-7 (Filt4) and columns 8-15 (Filt5)
50376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
50476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(secondFilters));
50576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
50676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(secondFilters));
50776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
50876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // taps 4/5 for columns 0-7 (Filt6) and columns 8-15 (Filt7)
50976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
51076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(thirdFilters));
51176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
51276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(thirdFilters));
51376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
51476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // saturating add of the smaller of the two middle products
51576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
51676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
51776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
51876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));
51976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
52076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // saturating add of the larger of the two middle products
52176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
52276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
52376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
52476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));
52576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
52676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
52776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
52876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(addFilterReg64));
52976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
53076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org                  _mm256_castsi256_si128(addFilterReg64));
53176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
53276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // arithmetic shift right by 7 of every 16-bit element
53376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
53476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
53576e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
53676e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // pack to 8 bits with unsigned saturation: bytes 0-7 come from
53776e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // srcRegFilt1 (columns 0-7) and bytes 8-15 from srcRegFilt3
53876e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // (columns 8-15)
53976e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
54076e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org
54176e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    // store the final output row (16 bytes, aligned store)
54276e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
54376e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org  }
54476e516e2154f353aa02c504bac88afb0f95fefa7johannkoenig@chromium.org}
545