/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
100a39d0a697ff3603e8c100300fda363658e10b23James Zern
110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <immintrin.h>
120a39d0a697ff3603e8c100300fda363658e10b23James Zern
130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h"
140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/convolve.h"
150a39d0a697ff3603e8c100300fda363658e10b23James Zern
// -----------------------------------------------------------------------------
// Copy and average
180a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Copies a width x h block of 16-bit pixels from src to dst.
//
//   src, dst                 top-left pixel of the source / destination block
//   src_stride, dst_stride   row-to-row distance, in pixels
//   filter_x .. filter_y_stride, bd
//                            unused by this function
//   width                    must be a multiple of 4; the branches below are
//                            written for 4, 8, 16, 32 and 64
//   h                        number of rows to copy; h <= 0 copies nothing
//
// The 16-, 8- and 4-wide paths move two rows per iteration and finish an odd h
// with a single-row tail, so nothing past row h - 1 is read or written.
void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int filter_x_stride,
                                   const int16_t *filter_y, int filter_y_stride,
                                   int width, int h, int bd) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
      dst += dst_stride;
    }
  } else if (width > 16) {  // width = 32
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, p0);
      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
      dst += dst_stride;
    }
  } else if (width > 8) {  // width = 16
    for (; h > 1; h -= 2) {
      const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i r1 =
          _mm256_loadu_si256((const __m256i *)(src + src_stride));
      src += 2 * src_stride;
      _mm256_storeu_si256((__m256i *)dst, r0);
      _mm256_storeu_si256((__m256i *)(dst + dst_stride), r1);
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
      _mm256_storeu_si256((__m256i *)dst, r0);
    }
  } else if (width > 4) {  // width = 8
    for (; h > 1; h -= 2) {
      const __m128i r0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i r1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      src += 2 * src_stride;
      _mm_storeu_si128((__m128i *)dst, r0);
      _mm_storeu_si128((__m128i *)(dst + dst_stride), r1);
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m128i r0 = _mm_loadu_si128((const __m128i *)src);
      _mm_storeu_si128((__m128i *)dst, r0);
    }
  } else {  // width = 4
    for (; h > 1; h -= 2) {
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i r1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      src += 2 * src_stride;
      _mm_storel_epi64((__m128i *)dst, r0);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), r1);
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
      _mm_storel_epi64((__m128i *)dst, r0);
    }
  }
}
990a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Averages a width x h block of 16-bit pixels from src into dst:
// dst = (dst + src + 1) >> 1 for every pixel (rounding of _mm*_avg_epu16).
//
//   src, dst                 top-left pixel of the source / destination block
//   src_stride, dst_stride   row-to-row distance, in pixels
//   filter_x .. filter_y_stride, bd
//                            unused by this function
//   width                    must be a multiple of 4; the branches below are
//                            written for 4, 8, 16, 32 and 64
//   h                        number of rows to average; h <= 0 does nothing
//
// The 16-, 8- and 4-wide paths handle two rows per iteration and finish an odd
// h with a single-row tail, so nothing past row h - 1 is read or written.
void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
                                  uint16_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int width, int h, int bd) {
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  assert(width % 4 == 0);
  if (width > 32) {  // width = 64
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
      const __m256i u0 = _mm256_loadu_si256((const __m256i *)dst);
      const __m256i u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      const __m256i u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
      const __m256i u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
      _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
      dst += dst_stride;
    }
  } else if (width > 16) {  // width = 32
    for (; h > 0; --h) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
      const __m256i u0 = _mm256_loadu_si256((const __m256i *)dst);
      const __m256i u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
      src += src_stride;
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
      dst += dst_stride;
    }
  } else if (width > 8) {  // width = 16
    for (; h > 1; h -= 2) {
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i p1 =
          _mm256_loadu_si256((const __m256i *)(src + src_stride));
      const __m256i u0 = _mm256_loadu_si256((const __m256i *)dst);
      const __m256i u1 =
          _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
      src += 2 * src_stride;
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
      _mm256_storeu_si256((__m256i *)(dst + dst_stride),
                          _mm256_avg_epu16(p1, u1));
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
      const __m256i u0 = _mm256_loadu_si256((const __m256i *)dst);
      _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
    }
  } else if (width > 4) {  // width = 8
    for (; h > 1; h -= 2) {
      const __m128i p0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
      const __m128i u0 = _mm_loadu_si128((const __m128i *)dst);
      const __m128i u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
      src += 2 * src_stride;
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
      _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m128i p0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i u0 = _mm_loadu_si128((const __m128i *)dst);
      _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
    }
  } else {  // width = 4
    for (; h > 1; h -= 2) {
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
      const __m128i u0 = _mm_loadl_epi64((const __m128i *)dst);
      const __m128i u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
      src += 2 * src_stride;
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
      _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
      dst += 2 * dst_stride;
    }
    if (h > 0) {  // odd h: one row left
      const __m128i p0 = _mm_loadl_epi64((const __m128i *)src);
      const __m128i u0 = _mm_loadl_epi64((const __m128i *)dst);
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
    }
  }
}
1890a39d0a697ff3603e8c100300fda363658e10b23James Zern
// -----------------------------------------------------------------------------
// Horizontal and vertical filtering
1920a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Right shift applied to the 32-bit filter sums; filter_8x1_pixels() adds
// 1 << (CONV8_ROUNDING_BITS - 1) first so the shift rounds to nearest.
#define CONV8_ROUNDING_BITS (7)

// Byte-offset tables, one 16-byte row per 128-bit lane (both rows are equal).
// Each row lists four overlapping pairs of adjacent 16-bit pixels, i.e.
// pixels (k, k+1), (k+1, k+2), (k+2, k+3), (k+3, k+4) of the lane.
// pack_pixels() feeds signal_pattern_0 and signal_pattern_1 to
// _mm256_shuffle_epi8.

// k = 0
static const uint8_t signal_pattern_0[32] = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,  // low lane
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9   // high lane
};

// k = 2
static const uint8_t signal_pattern_1[32] = {
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13,  // low lane
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13   // high lane
};

// k = 3
static const uint8_t signal_pattern_2[32] = {
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,  // low lane
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15   // high lane
};

// 32-bit element indices for _mm256_permutevar8x32_epi32 (see pack_pixels()):
// both lanes receive dwords 2..5 of the source, i.e. 16-bit pixels 4..11.
static const uint32_t signal_index[8] = {
  2, 3, 4, 5,  // low lane
  2, 3, 4, 5   // high lane
};
2100a39d0a697ff3603e8c100300fda363658e10b23James Zern
// -----------------------------------------------------------------------------
// Horizontal Filtering
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern
2140a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
2150a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
2160a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
2170a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
2180a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
2190a39d0a697ff3603e8c100300fda363658e10b23James Zern
2200a39d0a697ff3603e8c100300fda363658e10b23James Zern  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
2210a39d0a697ff3603e8c100300fda363658e10b23James Zern  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
2220a39d0a697ff3603e8c100300fda363658e10b23James Zern  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
2230a39d0a697ff3603e8c100300fda363658e10b23James Zern  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
2240a39d0a697ff3603e8c100300fda363658e10b23James Zern}
2250a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Note:
//  Shared by 8x2 and 16x1 block
//
// Packs two 16-pixel loads for filter_8x1_pixels(). In every x[k] the low lane
// comes from *s0 and the high lane from *s1:
//   x[0..3]: tap pairs 0..3 for outputs 0-3 of each load
//   x[4..7]: tap pairs 0..3 for outputs 4-7 of each load
// x[4] and x[5] simply repeat x[2] and x[3]: the low lanes of pack_pixels()
// entries 2 and 3 hold data that serves both roles.
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i first[4];
  __m256i second[4];

  pack_pixels(s0, first);
  pack_pixels(s1, second);

  // 0x20: low lane of each operand.
  x[0] = _mm256_permute2x128_si256(first[0], second[0], 0x20);
  x[1] = _mm256_permute2x128_si256(first[1], second[1], 0x20);
  x[2] = _mm256_permute2x128_si256(first[2], second[2], 0x20);
  x[3] = _mm256_permute2x128_si256(first[3], second[3], 0x20);

  x[4] = x[2];
  x[5] = x[3];

  // 0x31: high lane of each operand.
  x[6] = _mm256_permute2x128_si256(first[0], second[0], 0x31);
  x[7] = _mm256_permute2x128_si256(first[1], second[1], 0x31);
}
2420a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Packs one row for filter_8x1_pixels(): loads 16 pixels starting at src and
// fills x[0..3] so that x[k] holds tap pair k for outputs 0-3 in its low lane
// and tap pair k for outputs 4-7 in its high lane.
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i packed[4];
  const __m256i row = _mm256_loadu_si256((const __m256i *)src);

  pack_pixels(&row, packed);

  // 0x30: low lane from the first operand, high lane from the second.
  x[0] = _mm256_permute2x128_si256(packed[0], packed[2], 0x30);
  x[1] = _mm256_permute2x128_si256(packed[1], packed[3], 0x30);
  x[2] = _mm256_permute2x128_si256(packed[2], packed[0], 0x30);
  x[3] = _mm256_permute2x128_si256(packed[3], packed[1], 0x30);
}
2530a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Packs two vertically adjacent rows (src and src + stride, 16 pixels loaded
// from each) into x[0..7] via pack_16_pixels(). stride is in pixels.
static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
                                   __m256i *x) {
  const __m256i top = _mm256_loadu_si256((const __m256i *)src);
  const __m256i bottom = _mm256_loadu_si256((const __m256i *)(src + stride));
  pack_16_pixels(&top, &bottom, x);
}
2610a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Packs one 16-output row into x[0..7] via pack_16_pixels(): two overlapping
// 16-pixel loads, at src (left 8 outputs) and at src + 8 (right 8 outputs).
static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
  const __m256i left = _mm256_loadu_si256((const __m256i *)src);
  const __m256i right = _mm256_loadu_si256((const __m256i *)(src + 8));
  pack_16_pixels(&left, &right, x);
}
2680a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Note:
//  Shared by horizontal and vertical filtering
//
// Reads the eight 16-bit taps at filter and broadcasts each adjacent pair:
// f[k] holds (filter[2k], filter[2k + 1]) repeated in all eight 32-bit
// positions, the operand layout _mm256_madd_epi16 needs to apply one tap pair
// to eight pixel pairs at once.
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  // Same eight taps in both 128-bit lanes.
  const __m256i both_lanes =
      _mm256_insertf128_si256(_mm256_castsi128_si256(taps), taps, 1);
  // Byte selectors: each picks one 4-byte tap pair for every 32-bit slot.
  const __m256i pick_taps_01 = _mm256_set1_epi32(0x03020100);
  const __m256i pick_taps_23 = _mm256_set1_epi32(0x07060504);
  const __m256i pick_taps_45 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i pick_taps_67 = _mm256_set1_epi32(0x0f0e0d0c);

  f[0] = _mm256_shuffle_epi8(both_lanes, pick_taps_01);
  f[1] = _mm256_shuffle_epi8(both_lanes, pick_taps_23);
  f[2] = _mm256_shuffle_epi8(both_lanes, pick_taps_45);
  f[3] = _mm256_shuffle_epi8(both_lanes, pick_taps_67);
}
2830a39d0a697ff3603e8c100300fda363658e10b23James Zern
2840a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
2850a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     const __m256i *fil /*fil[4]*/,
2860a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     __m256i *y) {
2870a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i a, a0, a1;
2880a39d0a697ff3603e8c100300fda363658e10b23James Zern
2890a39d0a697ff3603e8c100300fda363658e10b23James Zern  a0 = _mm256_madd_epi16(fil[0], sig[0]);
2900a39d0a697ff3603e8c100300fda363658e10b23James Zern  a1 = _mm256_madd_epi16(fil[3], sig[3]);
2910a39d0a697ff3603e8c100300fda363658e10b23James Zern  a = _mm256_add_epi32(a0, a1);
2920a39d0a697ff3603e8c100300fda363658e10b23James Zern
2930a39d0a697ff3603e8c100300fda363658e10b23James Zern  a0 = _mm256_madd_epi16(fil[1], sig[1]);
2940a39d0a697ff3603e8c100300fda363658e10b23James Zern  a1 = _mm256_madd_epi16(fil[2], sig[2]);
2950a39d0a697ff3603e8c100300fda363658e10b23James Zern
2960a39d0a697ff3603e8c100300fda363658e10b23James Zern  {
2970a39d0a697ff3603e8c100300fda363658e10b23James Zern    const __m256i min = _mm256_min_epi32(a0, a1);
2980a39d0a697ff3603e8c100300fda363658e10b23James Zern    a = _mm256_add_epi32(a, min);
2990a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
3000a39d0a697ff3603e8c100300fda363658e10b23James Zern  {
3010a39d0a697ff3603e8c100300fda363658e10b23James Zern    const __m256i max = _mm256_max_epi32(a0, a1);
3020a39d0a697ff3603e8c100300fda363658e10b23James Zern    a = _mm256_add_epi32(a, max);
3030a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
3040a39d0a697ff3603e8c100300fda363658e10b23James Zern  {
3050a39d0a697ff3603e8c100300fda363658e10b23James Zern    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
3060a39d0a697ff3603e8c100300fda363658e10b23James Zern    a = _mm256_add_epi32(a, rounding);
3070a39d0a697ff3603e8c100300fda363658e10b23James Zern    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
3080a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
3090a39d0a697ff3603e8c100300fda363658e10b23James Zern}
3100a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Converts eight 32-bit filter results to pixels and stores one 8-pixel row.
//
//   y     eight 32-bit values: outputs 0-3 in the low lane, 4-7 in the high
//   mask  maximum pixel value in every 16-bit element; only its low 128 bits
//         are used
//   dst   receives 8 pixels (unaligned store)
//
// _mm_packus_epi32 saturates to the unsigned range [0, 65535], so the upper
// clamp must be an unsigned minimum: a signed minimum would treat packed
// values >= 0x8000 as negative and let them through unclamped.
static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
                                    uint16_t *dst) {
  const __m128i low_half = _mm256_castsi256_si128(*y);
  const __m128i high_half = _mm256_extractf128_si256(*y, 1);
  const __m128i limit = _mm256_castsi256_si128(*mask);
  const __m128i packed = _mm_packus_epi32(low_half, high_half);

  _mm_storeu_si128((__m128i *)dst, _mm_min_epu16(packed, limit));
}
3190a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Converts two registers of 32-bit filter results to pixels and stores two
// 8-pixel rows.
//
//   y0     outputs 0-3: first row in the low lane, second row in the high lane
//   y1     outputs 4-7, same lane assignment
//   mask   maximum pixel value in every 16-bit element
//   dst    first row; the second row is written at dst + pitch
//   pitch  row-to-row distance, in pixels
//
// _mm256_packus_epi32 works per 128-bit lane, so after packing the low lane is
// the complete first row and the high lane the complete second row. It
// saturates to the unsigned range [0, 65535], so the upper clamp must be an
// unsigned minimum: a signed minimum would let packed values >= 0x8000 through.
static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
                                    const __m256i *mask, uint16_t *dst,
                                    ptrdiff_t pitch) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);
  const __m256i clamped = _mm256_min_epu16(packed, *mask);

  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(clamped));
  _mm_storeu_si128((__m128i *)(dst + pitch),
                   _mm256_extractf128_si256(clamped, 1));
}
3280a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Converts two registers of 32-bit filter results to pixels and stores one
// 16-pixel row.
//
//   y0    outputs 0-3 in the low lane, outputs 8-11 in the high lane
//   y1    outputs 4-7 in the low lane, outputs 12-15 in the high lane
//   mask  maximum pixel value in every 16-bit element
//   dst   receives 16 pixels (unaligned store)
//
// _mm256_packus_epi32 works per 128-bit lane, which is what puts the outputs
// back in order. It saturates to the unsigned range [0, 65535], so the upper
// clamp must be an unsigned minimum: a signed minimum would let packed values
// >= 0x8000 through.
static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
                                     const __m256i *mask, uint16_t *dst) {
  const __m256i packed = _mm256_packus_epi32(*y0, *y1);

  _mm256_storeu_si256((__m256i *)dst, _mm256_min_epu16(packed, *mask));
}
3350a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Applies the 8-tap filter horizontally to a block 8 pixels wide and `height`
// rows tall.
//
//   src_ptr    first output position of the first row; the filter window for
//              each output starts 3 pixels to its left (hence src_ptr -= 3),
//              and 16 pixels are loaded per row from that point
//   src_pitch  source row-to-row distance, in pixels
//   dst_ptr    first pixel of the first destination row
//   dst_pitch  destination row-to-row distance, in pixels
//   height     number of rows; 0 writes nothing
//   filter     eight 16-bit taps
//   bd         bit depth; results are clamped to (1 << bd) - 1
//
// Rows are processed two at a time with a single-row tail for odd heights.
// height is unsigned, so the pair loop tests before subtracting: a
// bottom-tested loop would wrap height around for height < 2 and run far past
// the block.
static void vpx_highbd_filter_block1d8_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[8], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);

  __m256i ff[4];
  pack_filters(filter, ff);

  src_ptr -= 3;
  while (height > 1) {
    pack_8x2_pixels(src_ptr, src_pitch, signal);
    filter_8x1_pixels(signal, ff, &res0);      // outputs 0-3 of both rows
    filter_8x1_pixels(&signal[4], ff, &res1);  // outputs 4-7 of both rows
    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  }

  if (height > 0) {  // odd height: one row left
    pack_8x1_pixels(src_ptr, signal);
    filter_8x1_pixels(signal, ff, &res0);
    store_8x1_pixels(&res0, &max, dst_ptr);
  }
}
3620a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Horizontal 8-tap filtering of a 16-pixel-wide block, one row per iteration.
//
// Reading starts 3 samples to the left of src_ptr. Each row is packed into
// eight vectors; the first four produce one result vector and the last four
// the other, and both are passed to store_16x1_pixels() together with
// (1 << bd) - 1 as the upper bound. The body executes before |height| is
// tested, so callers must pass height >= 1.
static void vpx_highbd_filter_block1d16_h8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip_max = _mm256_set1_epi16((1 << bd) - 1);
  const uint16_t *in = src_ptr - 3;
  uint16_t *out = dst_ptr;
  uint32_t rows_left = height;
  __m256i taps[4];
  __m256i pixels[8];
  __m256i part0, part1;

  pack_filters(filter, taps);

  for (;;) {
    pack_16x1_pixels(in, pixels);
    filter_8x1_pixels(&pixels[0], taps, &part0);
    filter_8x1_pixels(&pixels[4], taps, &part1);
    store_16x1_pixels(&part0, &part1, &clip_max, out);
    rows_left -= 1;
    in += src_pitch;
    out += dst_pitch;
    if (rows_left == 0) break;
  }
}
3830a39d0a697ff3603e8c100300fda363658e10b23James Zern
3840a39d0a697ff3603e8c100300fda363658e10b23James Zern// -----------------------------------------------------------------------------
3850a39d0a697ff3603e8c100300fda363658e10b23James Zern// 2-tap horizontal filtering
3860a39d0a697ff3603e8c100300fda363658e10b23James Zern
3870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
3880a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
3890a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
3900a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i p = _mm256_set1_epi32(0x09080706);
3910a39d0a697ff3603e8c100300fda363658e10b23James Zern  f[0] = _mm256_shuffle_epi8(hh, p);
3920a39d0a697ff3603e8c100300fda363658e10b23James Zern}
3930a39d0a697ff3603e8c100300fda363658e10b23James Zern
3940a39d0a697ff3603e8c100300fda363658e10b23James Zern// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
3950a39d0a697ff3603e8c100300fda363658e10b23James Zern// the difference is s0/s1 specifies first and second rows or,
3960a39d0a697ff3603e8c100300fda363658e10b23James Zern// first 16 samples and 8-sample shifted 16 samples
3970a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
3980a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     __m256i *sig) {
3990a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
4000a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
4010a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
4020a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
4030a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
4040a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
4050a39d0a697ff3603e8c100300fda363658e10b23James Zern  r0 = _mm256_shuffle_epi8(r0, sf2);
4060a39d0a697ff3603e8c100300fda363658e10b23James Zern  r1 = _mm256_shuffle_epi8(r1, sf2);
4070a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
4080a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
4090a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4100a39d0a697ff3603e8c100300fda363658e10b23James Zern
4110a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x2_2t_pixels(const uint16_t *src,
4120a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      const ptrdiff_t pitch, __m256i *sig) {
4130a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
4140a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
4150a39d0a697ff3603e8c100300fda363658e10b23James Zern  pack_16_2t_pixels(&r0, &r1, sig);
4160a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4170a39d0a697ff3603e8c100300fda363658e10b23James Zern
4180a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16x1_2t_pixels(const uint16_t *src,
4190a39d0a697ff3603e8c100300fda363658e10b23James Zern                                       __m256i *sig /*sig[2]*/) {
4200a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
4210a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
4220a39d0a697ff3603e8c100300fda363658e10b23James Zern  pack_16_2t_pixels(&r0, &r1, sig);
4230a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4240a39d0a697ff3603e8c100300fda363658e10b23James Zern
4250a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x1_2t_pixels(const uint16_t *src,
4260a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      __m256i *sig /*sig[2]*/) {
4270a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
4280a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
4290a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
4300a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
4310a39d0a697ff3603e8c100300fda363658e10b23James Zern  r0 = _mm256_permutevar8x32_epi32(r0, idx);
4320a39d0a697ff3603e8c100300fda363658e10b23James Zern  r0 = _mm256_shuffle_epi8(r0, sf2);
4330a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
4340a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4350a39d0a697ff3603e8c100300fda363658e10b23James Zern
4360a39d0a697ff3603e8c100300fda363658e10b23James Zern// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
4370a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
4380a39d0a697ff3603e8c100300fda363658e10b23James Zern                                       __m256i *y0, __m256i *y1) {
4390a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
4400a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
4410a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
4420a39d0a697ff3603e8c100300fda363658e10b23James Zern  x0 = _mm256_add_epi32(x0, rounding);
4430a39d0a697ff3603e8c100300fda363658e10b23James Zern  x1 = _mm256_add_epi32(x1, rounding);
4440a39d0a697ff3603e8c100300fda363658e10b23James Zern  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
4450a39d0a697ff3603e8c100300fda363658e10b23James Zern  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
4460a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4470a39d0a697ff3603e8c100300fda363658e10b23James Zern
4480a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
4490a39d0a697ff3603e8c100300fda363658e10b23James Zern                                        __m256i *y0) {
4500a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
4510a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
4520a39d0a697ff3603e8c100300fda363658e10b23James Zern  x0 = _mm256_add_epi32(x0, rounding);
4530a39d0a697ff3603e8c100300fda363658e10b23James Zern  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
4540a39d0a697ff3603e8c100300fda363658e10b23James Zern}
4550a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Horizontal 2-tap filtering of an 8-pixel-wide block of |height| rows.
//
// Like the 8-tap variant, reading starts 3 samples to the left of src_ptr;
// the coefficient pair used is (filter[3], filter[4]) as selected by
// pack_2t_filter(). NOTE(review): the matching sample offset presumably comes
// from signal_pattern_2 - confirm against its definition.
// Rows are processed in pairs, with a single trailing row handled separately
// for odd heights. The loop body runs before |height| is tested, so callers
// must pass height >= 2.
static void vpx_highbd_filter_block1d8_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_step = src_pitch << 1;  // two rows per iteration
  const ptrdiff_t dst_step = dst_pitch << 1;
  const uint16_t *in = src_ptr - 3;
  uint16_t *out = dst_ptr;
  uint32_t rows_left = height;
  __m256i taps;
  __m256i pixels[2];
  __m256i row0, row1;

  pack_2t_filter(filter, &taps);

  for (;;) {
    pack_8x2_2t_pixels(in, src_pitch, pixels);
    filter_16_2t_pixels(pixels, &taps, &row0, &row1);
    store_8x2_pixels(&row0, &row1, &clip_max, out, dst_pitch);
    rows_left -= 2;
    in += src_step;
    out += dst_step;
    if (rows_left <= 1) break;
  }

  // Odd height: one row is still pending.
  if (rows_left != 0) {
    pack_8x1_2t_pixels(in, pixels);
    filter_8x1_2t_pixels(pixels, &taps, &row0);
    store_8x1_pixels(&row0, &clip_max, out);
  }
}
4810a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Horizontal 2-tap filtering of a 16-pixel-wide block, one row per iteration.
//
// Reading starts 3 samples to the left of src_ptr and the coefficient pair is
// the one produced by pack_2t_filter(). Both result vectors of a row are
// passed to store_16x1_pixels() with (1 << bd) - 1 as the upper bound. The
// body executes before |height| is tested, so callers must pass height >= 1.
static void vpx_highbd_filter_block1d16_h2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip_max = _mm256_set1_epi16((1 << bd) - 1);
  const uint16_t *in = src_ptr - 3;
  uint16_t *out = dst_ptr;
  uint32_t rows_left = height;
  __m256i taps;
  __m256i pixels[2];
  __m256i part0, part1;

  pack_2t_filter(filter, &taps);

  for (;;) {
    pack_16x1_2t_pixels(in, pixels);
    filter_16_2t_pixels(pixels, &taps, &part0, &part1);
    store_16x1_pixels(&part0, &part1, &clip_max, out);
    rows_left -= 1;
    in += src_pitch;
    out += dst_pitch;
    if (rows_left == 0) break;
  }
}
5010a39d0a697ff3603e8c100300fda363658e10b23James Zern
5020a39d0a697ff3603e8c100300fda363658e10b23James Zern// -----------------------------------------------------------------------------
5030a39d0a697ff3603e8c100300fda363658e10b23James Zern// Vertical Filtering
5040a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Primes the signal buffer for 8-wide vertical 8-tap filtering.
//
// Loads rows 0..6 (8 samples each). Consecutive rows are paired into one
// 256-bit value {row i, row i+1}, and neighbouring pairs are interleaved
// sample by sample:
//   sig[k]     = unpacklo({row 2k, row 2k+1}, {row 2k+1, row 2k+2})
//   sig[k + 4] = unpackhi(same operands)              for k = 0, 1, 2
// sig[8] keeps row 6 in its low 128 bits so pack_8x9_pixels() can pair it
// with row 7. sig[3] and sig[7] are not written here.
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m128i row[7];
  __m256i pair[6];
  int i;

  for (i = 0; i < 7; ++i) {
    row[i] = _mm_loadu_si128((const __m128i *)(src + i * pitch));
  }

  // pair[i] = { row i (low lane), row i + 1 (high lane) }
  for (i = 0; i < 6; ++i) {
    pair[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(row[i]),
                                      row[i + 1], 1);
  }

  for (i = 0; i < 3; ++i) {
    sig[i] = _mm256_unpacklo_epi16(pair[2 * i], pair[2 * i + 1]);
    sig[i + 4] = _mm256_unpackhi_epi16(pair[2 * i], pair[2 * i + 1]);
  }

  // Only the low lane is meaningful; the high lane is replaced on use.
  sig[8] = _mm256_castsi128_si256(row[6]);
}
5350a39d0a697ff3603e8c100300fda363658e10b23James Zern
5360a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
5370a39d0a697ff3603e8c100300fda363658e10b23James Zern                                   __m256i *sig) {
5380a39d0a697ff3603e8c100300fda363658e10b23James Zern  // base + 7th row
5390a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i s0 = _mm256_castsi128_si256(
5400a39d0a697ff3603e8c100300fda363658e10b23James Zern      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
5410a39d0a697ff3603e8c100300fda363658e10b23James Zern  // base + 8th row
5420a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i s1 = _mm256_castsi128_si256(
5430a39d0a697ff3603e8c100300fda363658e10b23James Zern      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
5440a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
5450a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
5460a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[3] = _mm256_unpacklo_epi16(s2, s3);
5470a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[7] = _mm256_unpackhi_epi16(s2, s3);
5480a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[8] = s1;
5490a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5500a39d0a697ff3603e8c100300fda363658e10b23James Zern
5510a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
5520a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     __m256i *y0, __m256i *y1) {
5530a39d0a697ff3603e8c100300fda363658e10b23James Zern  filter_8x1_pixels(sig, f, y0);
5540a39d0a697ff3603e8c100300fda363658e10b23James Zern  filter_8x1_pixels(&sig[4], f, y1);
5550a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5560a39d0a697ff3603e8c100300fda363658e10b23James Zern
5570a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void update_pixels(__m256i *sig) {
5580a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i;
5590a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 0; i < 3; ++i) {
5600a39d0a697ff3603e8c100300fda363658e10b23James Zern    sig[i] = sig[i + 1];
5610a39d0a697ff3603e8c100300fda363658e10b23James Zern    sig[i + 4] = sig[i + 5];
5620a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
5630a39d0a697ff3603e8c100300fda363658e10b23James Zern}
5640a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Vertical 8-tap filtering of an 8-pixel-wide block, two output rows per
// iteration.
//
// Rows 0..6 are packed once up front; every iteration then loads rows 7 and 8
// relative to the current source position, filters, stores two rows and
// slides the signal window. |height| is unsigned and reduced by 2 until it
// reaches zero, so it must be even and non-zero.
static void vpx_highbd_filter_block1d8_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_step = src_pitch << 1;  // two rows per iteration
  const ptrdiff_t dst_step = dst_pitch << 1;
  __m256i taps[4];
  __m256i window[9];
  __m256i out0, out1;

  pack_filters(filter, taps);
  pack_8x9_init(src_ptr, src_pitch, window);

  for (;;) {
    pack_8x9_pixels(src_ptr, src_pitch, window);
    filter_8x9_pixels(window, taps, &out0, &out1);
    store_8x2_pixels(&out0, &out1, &clip_max, dst_ptr, dst_pitch);
    update_pixels(window);

    src_ptr += src_step;
    dst_ptr += dst_step;
    height -= 2;
    if (height == 0) break;
  }
}
5880a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Primes the signal buffer for 16-wide vertical 8-tap filtering.
//
// Loads rows 0..6 (16 samples each). For k = 0, 1, 2 the rows a = 2k,
// b = 2k + 1, c = 2k + 2 are regrouped by 128-bit lane into
//   {a.low, b.low}, {a.high, b.high}, {b.low, c.low}, {b.high, c.high}
// and interleaved sample by sample:
//   sig[k]      = unpacklo(low-lane groups)
//   sig[k + 4]  = unpackhi(low-lane groups)
//   sig[k + 8]  = unpacklo(high-lane groups)
//   sig[k + 12] = unpackhi(high-lane groups)
// sig[16] keeps row 6 for pack_16x9_pixels(). sig[3], sig[7], sig[11] and
// sig[15] are not written here.
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i row[7];
  int k;

  for (k = 0; k < 7; ++k) {
    row[k] = _mm256_loadu_si256((const __m256i *)(src + k * pitch));
  }

  for (k = 0; k < 3; ++k) {
    const __m256i a = row[2 * k];
    const __m256i b = row[2 * k + 1];
    const __m256i c = row[2 * k + 2];
    const __m256i ab_lo = _mm256_permute2x128_si256(a, b, 0x20);
    const __m256i ab_hi = _mm256_permute2x128_si256(a, b, 0x31);
    const __m256i bc_lo = _mm256_permute2x128_si256(b, c, 0x20);
    const __m256i bc_hi = _mm256_permute2x128_si256(b, c, 0x31);

    sig[k] = _mm256_unpacklo_epi16(ab_lo, bc_lo);
    sig[k + 4] = _mm256_unpackhi_epi16(ab_lo, bc_lo);
    sig[k + 8] = _mm256_unpacklo_epi16(ab_hi, bc_hi);
    sig[k + 12] = _mm256_unpackhi_epi16(ab_hi, bc_hi);
  }

  sig[16] = row[6];
}
6380a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Adds the two newest rows to the 16-wide vertical signal buffer.
//
// Loads rows 7 and 8 relative to |src| and combines them with the row carried
// over in sig[16] (stored by pack_16x9_init() or by the previous call). The
// rows are regrouped by 128-bit lane and interleaved into sig[3] and sig[7]
// (low-lane groups) and sig[11] and sig[15] (high-lane groups). Row 8 then
// becomes the carried-over row in sig[16].
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  const __m256i prev = sig[16];
  const __m256i row7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  const __m256i row8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  // 0x20 gathers the low lanes of both operands, 0x31 the high lanes.
  const __m256i p7_lo = _mm256_permute2x128_si256(prev, row7, 0x20);
  const __m256i p7_hi = _mm256_permute2x128_si256(prev, row7, 0x31);
  const __m256i r78_lo = _mm256_permute2x128_si256(row7, row8, 0x20);
  const __m256i r78_hi = _mm256_permute2x128_si256(row7, row8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(p7_lo, r78_lo);
  sig[7] = _mm256_unpackhi_epi16(p7_lo, r78_lo);
  sig[11] = _mm256_unpacklo_epi16(p7_hi, r78_hi);
  sig[15] = _mm256_unpackhi_epi16(p7_hi, r78_hi);

  sig[16] = row8;
}
6610a39d0a697ff3603e8c100300fda363658e10b23James Zern
6620a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
6630a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      __m256i *y0, __m256i *y1) {
6640a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i res[4];
6650a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i;
6660a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 0; i < 4; ++i) {
6670a39d0a697ff3603e8c100300fda363658e10b23James Zern    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
6680a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6690a39d0a697ff3603e8c100300fda363658e10b23James Zern
6700a39d0a697ff3603e8c100300fda363658e10b23James Zern  {
6710a39d0a697ff3603e8c100300fda363658e10b23James Zern    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
6720a39d0a697ff3603e8c100300fda363658e10b23James Zern    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
6730a39d0a697ff3603e8c100300fda363658e10b23James Zern    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
6740a39d0a697ff3603e8c100300fda363658e10b23James Zern    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
6750a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
6760a39d0a697ff3603e8c100300fda363658e10b23James Zern}
6770a39d0a697ff3603e8c100300fda363658e10b23James Zern
6780a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
6790a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     const __m256i *mask, uint16_t *dst,
6800a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     ptrdiff_t pitch) {
6810a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i p = _mm256_min_epi16(*y0, *mask);
6820a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm256_storeu_si256((__m256i *)dst, p);
6830a39d0a697ff3603e8c100300fda363658e10b23James Zern  p = _mm256_min_epi16(*y1, *mask);
6840a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
6850a39d0a697ff3603e8c100300fda363658e10b23James Zern}
6860a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Slides the 16-wide vertical signal window by one slot: update_pixels() is
// applied to sig[0..7] and then to sig[8..15]. sig[16] is left untouched.
static void update_16x9_pixels(__m256i *sig) {
  __m256i *const first_half = sig;
  __m256i *const second_half = sig + 8;
  update_pixels(first_half);
  update_pixels(second_half);
}
6910a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Vertical 8-tap filtering of a 16-pixel-wide block, two output rows per
// iteration.
//
// Rows 0..6 are packed once up front; every iteration then loads rows 7 and 8
// relative to the current source position, filters, stores two rows and
// slides the signal window. |height| is unsigned and reduced by 2 until it
// reaches zero, so it must be even and non-zero.
static void vpx_highbd_filter_block1d16_v8_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i clip_max = _mm256_set1_epi16((1 << bd) - 1);
  const ptrdiff_t src_step = src_pitch << 1;  // two rows per iteration
  const ptrdiff_t dst_step = dst_pitch << 1;
  __m256i taps[4];
  __m256i window[17];
  __m256i out0, out1;

  pack_filters(filter, taps);
  pack_16x9_init(src_ptr, src_pitch, window);

  for (;;) {
    pack_16x9_pixels(src_ptr, src_pitch, window);
    filter_16x9_pixels(window, taps, &out0, &out1);
    store_16x2_pixels(&out0, &out1, &clip_max, dst_ptr, dst_pitch);
    update_16x9_pixels(window);

    src_ptr += src_step;
    dst_ptr += dst_step;
    height -= 2;
    if (height == 0) break;
  }
}
7140a39d0a697ff3603e8c100300fda363658e10b23James Zern
7150a39d0a697ff3603e8c100300fda363658e10b23James Zern// -----------------------------------------------------------------------------
7160a39d0a697ff3603e8c100300fda363658e10b23James Zern// 2-tap vertical filtering
7170a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Primes the 2-tap vertical pipeline for 16-pixel-wide blocks: the first
// source row (16 x uint16_t, unaligned) is cached in sig[2], which is the slot
// pack_16x2_2t_pixels() treats as "the previously loaded row".
static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
  const __m256i *const first_row = (const __m256i *)src;
  sig[2] = _mm256_loadu_si256(first_row);
}
7210a39d0a697ff3603e8c100300fda363658e10b23James Zern
7220a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
7230a39d0a697ff3603e8c100300fda363658e10b23James Zern                                       __m256i *sig) {
7240a39d0a697ff3603e8c100300fda363658e10b23James Zern  // load the next row
7250a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
7260a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
7270a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
7280a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[2] = u;
7290a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7300a39d0a697ff3603e8c100300fda363658e10b23James Zern
7310a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
7320a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         __m256i *y0, __m256i *y1) {
7330a39d0a697ff3603e8c100300fda363658e10b23James Zern  filter_16_2t_pixels(sig, f, y0, y1);
7340a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7350a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap vertical filter for a 16-pixel-wide, high-bitdepth block.
//
// Each output row is produced from source rows r and r + 1; results are
// clamped to (1 << bd) - 1 by store_16x1_pixels() and written to dst_ptr.
// The loop body always executes at least once, so height must be non-zero
// (a zero height would wrap the unsigned counter).
static void vpx_highbd_filter_block1d16_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;
  __m256i rows[3];
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);
  // rows[2] holds the most recently loaded source row between iterations.
  pack_16x2_init(src_ptr, rows);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, rows);
    filter_16x2_2t_pixels(rows, &taps, &out0, &out1);
    store_16x1_pixels(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
7560a39d0a697ff3603e8c100300fda363658e10b23James Zern
7570a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
7580a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
7590a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i p = _mm_set1_epi32(0x09080706);
7600a39d0a697ff3603e8c100300fda363658e10b23James Zern  f[0] = _mm_shuffle_epi8(h, p);
7610a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7620a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Primes the 2-tap vertical pipeline for 8-pixel-wide blocks: the first source
// row (8 x uint16_t, unaligned) is cached in sig[2], the slot that
// pack_8x2_2t_pixels_ver() treats as "the previously loaded row".
static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
  const __m128i *const first_row = (const __m128i *)src;
  sig[2] = _mm_loadu_si128(first_row);
}
7660a39d0a697ff3603e8c100300fda363658e10b23James Zern
7670a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
7680a39d0a697ff3603e8c100300fda363658e10b23James Zern                                          __m128i *sig) {
7690a39d0a697ff3603e8c100300fda363658e10b23James Zern  // load the next row
7700a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
7710a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[0] = _mm_unpacklo_epi16(sig[2], u);
7720a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[1] = _mm_unpackhi_epi16(sig[2], u);
7730a39d0a697ff3603e8c100300fda363658e10b23James Zern  sig[2] = u;
7740a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7750a39d0a697ff3603e8c100300fda363658e10b23James Zern
7760a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
7770a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      __m128i *y0, __m128i *y1) {
7780a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
7790a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i x0 = _mm_madd_epi16(sig[0], *f);
7800a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i x1 = _mm_madd_epi16(sig[1], *f);
7810a39d0a697ff3603e8c100300fda363658e10b23James Zern  x0 = _mm_add_epi32(x0, rounding);
7820a39d0a697ff3603e8c100300fda363658e10b23James Zern  x1 = _mm_add_epi32(x1, rounding);
7830a39d0a697ff3603e8c100300fda363658e10b23James Zern  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
7840a39d0a697ff3603e8c100300fda363658e10b23James Zern  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
7850a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7860a39d0a697ff3603e8c100300fda363658e10b23James Zern
7870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
7880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                           const __m128i *mask, uint16_t *dst) {
7890a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i res = _mm_packus_epi32(*y0, *y1);
7900a39d0a697ff3603e8c100300fda363658e10b23James Zern  res = _mm_min_epi16(res, *mask);
7910a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm_storeu_si128((__m128i *)dst, res);
7920a39d0a697ff3603e8c100300fda363658e10b23James Zern}
7930a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap vertical filter for an 8-pixel-wide, high-bitdepth block (128-bit
// path).
//
// Each output row is produced from source rows r and r + 1; results are
// clamped to (1 << bd) - 1 and written to dst_ptr. The loop body always
// executes at least once, so height must be non-zero (a zero height would
// wrap the unsigned counter).
static void vpx_highbd_filter_block1d8_v2_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i taps;
  __m128i rows[3];
  __m128i out0, out1;

  pack_8x1_2t_filter(filter, &taps);
  // rows[2] holds the most recently loaded source row between iterations.
  pack_8x2_init(src_ptr, rows);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, rows);
    filter_8_2t_pixels(rows, &taps, &out0, &out1);
    store_8x1_2t_pixels_ver(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
8140a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Filtering with averaging: the filtered result is averaged with the pixels
// already present in dst.
8160a39d0a697ff3603e8c100300fda363658e10b23James Zern
8170a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
8180a39d0a697ff3603e8c100300fda363658e10b23James Zern                                        uint16_t *dst) {
8190a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i a0 = _mm256_castsi256_si128(*y0);
8200a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
8210a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i res = _mm_packus_epi32(a0, a1);
8220a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
8230a39d0a697ff3603e8c100300fda363658e10b23James Zern  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
8240a39d0a697ff3603e8c100300fda363658e10b23James Zern  res = _mm_avg_epu16(res, pix);
8250a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm_storeu_si128((__m128i *)dst, res);
8260a39d0a697ff3603e8c100300fda363658e10b23James Zern}
8270a39d0a697ff3603e8c100300fda363658e10b23James Zern
8280a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
8290a39d0a697ff3603e8c100300fda363658e10b23James Zern                                        const __m256i *mask, uint16_t *dst,
8300a39d0a697ff3603e8c100300fda363658e10b23James Zern                                        ptrdiff_t pitch) {
8310a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i a = _mm256_packus_epi32(*y0, *y1);
8320a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
8330a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
8340a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i pix =
8350a39d0a697ff3603e8c100300fda363658e10b23James Zern      _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
8360a39d0a697ff3603e8c100300fda363658e10b23James Zern  a = _mm256_min_epi16(a, *mask);
8370a39d0a697ff3603e8c100300fda363658e10b23James Zern  a = _mm256_avg_epu16(a, pix);
8380a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
8390a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
8400a39d0a697ff3603e8c100300fda363658e10b23James Zern}
8410a39d0a697ff3603e8c100300fda363658e10b23James Zern
8420a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
8430a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         const __m256i *mask, uint16_t *dst) {
8440a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i a = _mm256_packus_epi32(*y0, *y1);
8450a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
8460a39d0a697ff3603e8c100300fda363658e10b23James Zern  a = _mm256_min_epi16(a, *mask);
8470a39d0a697ff3603e8c100300fda363658e10b23James Zern  a = _mm256_avg_epu16(a, pix);
8480a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm256_storeu_si256((__m256i *)dst, a);
8490a39d0a697ff3603e8c100300fda363658e10b23James Zern}
8500a39d0a697ff3603e8c100300fda363658e10b23James Zern
8510a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
8520a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         const __m256i *mask, uint16_t *dst,
8530a39d0a697ff3603e8c100300fda363658e10b23James Zern                                         ptrdiff_t pitch) {
8540a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
8550a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
8560a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m256i p = _mm256_min_epi16(*y0, *mask);
8570a39d0a697ff3603e8c100300fda363658e10b23James Zern  p = _mm256_avg_epu16(p, pix0);
8580a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm256_storeu_si256((__m256i *)dst, p);
8590a39d0a697ff3603e8c100300fda363658e10b23James Zern
8600a39d0a697ff3603e8c100300fda363658e10b23James Zern  p = _mm256_min_epi16(*y1, *mask);
8610a39d0a697ff3603e8c100300fda363658e10b23James Zern  p = _mm256_avg_epu16(p, pix1);
8620a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
8630a39d0a697ff3603e8c100300fda363658e10b23James Zern}
8640a39d0a697ff3603e8c100300fda363658e10b23James Zern
8650a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
8660a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               const __m128i *y1,
8670a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               const __m128i *mask,
8680a39d0a697ff3603e8c100300fda363658e10b23James Zern                                               uint16_t *dst) {
8690a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i res = _mm_packus_epi32(*y0, *y1);
8700a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
8710a39d0a697ff3603e8c100300fda363658e10b23James Zern  res = _mm_min_epi16(res, *mask);
8720a39d0a697ff3603e8c100300fda363658e10b23James Zern  res = _mm_avg_epu16(res, pix);
8730a39d0a697ff3603e8c100300fda363658e10b23James Zern  _mm_storeu_si128((__m128i *)dst, res);
8740a39d0a697ff3603e8c100300fda363658e10b23James Zern}
8750a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 8-tap horizontal filter with destination averaging, 8-pixel-wide block.
//
// Rows are handled two at a time; a single leftover row (odd height) is
// handled after the loop. Filter outputs are limited to (1 << bd) - 1 and
// averaged with the pixels already in dst_ptr.
// The paired loop always runs once, so height is expected to be at least 2
// (height == 1 would wrap the unsigned counter).
static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i window[8];
  __m256i row0, row1;

  pack_filters(filter, taps);

  // Step back three pixels so the 8-tap window starts left of the output.
  src_ptr -= 3;

  do {
    pack_8x2_pixels(src_ptr, src_pitch, window);
    filter_8x1_pixels(&window[0], taps, &row0);
    filter_8x1_pixels(&window[4], taps, &row1);
    store_8x2_avg_pixels(&row0, &row1, &max, dst_ptr, dst_pitch);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height >= 2);

  // Odd height: one row remains.
  if (height != 0) {
    pack_8x1_pixels(src_ptr, window);
    filter_8x1_pixels(window, taps, &row0);
    store_8x1_avg_pixels(&row0, &max, dst_ptr);
  }
}
9020a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 8-tap horizontal filter with destination averaging, 16-pixel-wide block.
//
// One row is produced per iteration: window[0..3] and window[4..7] are
// filtered separately and the two results are combined by
// store_16x1_avg_pixels(), which limits them to (1 << bd) - 1 and averages
// them with the pixels already in dst_ptr.
// The loop body always executes once, so height must be non-zero.
static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i window[8];
  __m256i out0, out1;

  pack_filters(filter, taps);

  // Step back three pixels so the 8-tap window starts left of the output.
  src_ptr -= 3;

  do {
    pack_16x1_pixels(src_ptr, window);
    filter_8x1_pixels(&window[0], taps, &out0);
    filter_8x1_pixels(&window[4], taps, &out1);
    store_16x1_avg_pixels(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
9230a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 8-tap vertical filter with destination averaging, 8-pixel-wide block.
//
// Two output rows are produced per iteration. Filter outputs are limited to
// (1 << bd) - 1 and averaged with the pixels already in dst_ptr.
// The loop only terminates when the counter reaches exactly zero, so height
// is expected to be a positive even number (an odd value would wrap the
// unsigned counter).
static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i rows[9];
  __m256i out0, out1;

  pack_filters(filter, taps);
  // Load the leading rows once; the loop then only fetches the new ones.
  pack_8x9_init(src_ptr, src_pitch, rows);

  do {
    pack_8x9_pixels(src_ptr, src_pitch, rows);
    filter_8x9_pixels(rows, taps, &out0, &out1);
    store_8x2_avg_pixels(&out0, &out1, &max, dst_ptr, dst_pitch);
    // Shift the row history for the next pair of output rows.
    update_pixels(rows);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height != 0);
}
9470a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 8-tap vertical filter with destination averaging, 16-pixel-wide block.
//
// Two output rows are produced per iteration. store_16x2_avg_pixels() limits
// the results to (1 << bd) - 1 and averages them with the pixels already in
// dst_ptr.
// The loop only terminates when the counter reaches exactly zero, so height
// is expected to be a positive even number (an odd value would wrap the
// unsigned counter).
static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps[4];
  __m256i rows[17];
  __m256i out0, out1;

  pack_filters(filter, taps);
  // Load the leading rows once; the loop then only fetches the new ones.
  pack_16x9_init(src_ptr, src_pitch, rows);

  do {
    pack_16x9_pixels(src_ptr, src_pitch, rows);
    filter_16x9_pixels(rows, taps, &out0, &out1);
    store_16x2_avg_pixels(&out0, &out1, &max, dst_ptr, dst_pitch);
    // Shift the row history for the next pair of output rows.
    update_16x9_pixels(rows);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height != 0);
}
9700a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap horizontal filter with destination averaging, 8-pixel-wide block.
//
// Rows are handled two at a time; a single leftover row (odd height) is
// handled after the loop. Filter outputs are limited to (1 << bd) - 1 and
// averaged with the pixels already in dst_ptr.
// The paired loop always runs once, so height is expected to be at least 2
// (height == 1 would wrap the unsigned counter).
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;
  __m256i pairs[2];
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);

  // Same three-pixel back-off as the 8-tap path; the 2-tap pack helpers are
  // presumably written relative to this offset -- confirm against them.
  src_ptr -= 3;

  do {
    pack_8x2_2t_pixels(src_ptr, src_pitch, pairs);
    filter_16_2t_pixels(pairs, &taps, &out0, &out1);
    store_8x2_avg_pixels(&out0, &out1, &max, dst_ptr, dst_pitch);

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;
  } while (height >= 2);

  // Odd height: one row remains.
  if (height != 0) {
    pack_8x1_2t_pixels(src_ptr, pairs);
    filter_8x1_2t_pixels(pairs, &taps, &out0);
    store_8x1_avg_pixels(&out0, &max, dst_ptr);
  }
}
9960a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap horizontal filter with destination averaging, 16-pixel-wide block.
//
// One row is produced per iteration; store_16x1_avg_pixels() limits the
// results to (1 << bd) - 1 and averages them with the pixels already in
// dst_ptr. The loop body always executes once, so height must be non-zero.
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;
  __m256i pairs[2];
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);

  // Same three-pixel back-off as the 8-tap path; the 2-tap pack helper is
  // presumably written relative to this offset -- confirm against it.
  src_ptr -= 3;

  do {
    pack_16x1_2t_pixels(src_ptr, pairs);
    filter_16_2t_pixels(pairs, &taps, &out0, &out1);
    store_16x1_avg_pixels(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
10160a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap vertical filter with destination averaging, 16-pixel-wide block.
//
// Each output row is produced from source rows r and r + 1; results are
// limited to (1 << bd) - 1 and averaged with the pixels already in dst_ptr.
// The loop body always executes once, so height must be non-zero.
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
  __m256i taps;
  __m256i rows[3];
  __m256i out0, out1;

  pack_2t_filter(filter, &taps);
  // rows[2] holds the most recently loaded source row between iterations.
  pack_16x2_init(src_ptr, rows);

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, rows);
    filter_16x2_2t_pixels(rows, &taps, &out0, &out1);
    store_16x1_avg_pixels(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
10370a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 2-tap vertical filter with destination averaging, 8-pixel-wide block
// (128-bit path).
//
// Each output row is produced from source rows r and r + 1; results are
// limited to (1 << bd) - 1 and averaged with the pixels already in dst_ptr.
// The loop body always executes once, so height must be non-zero.
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
  __m128i taps;
  __m128i rows[3];
  __m128i out0, out1;

  pack_8x1_2t_filter(filter, &taps);
  // rows[2] holds the most recently loaded source row between iterations.
  pack_8x2_init(src_ptr, rows);

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, rows);
    filter_8_2t_pixels(rows, &taps, &out0, &out1);
    store_8x1_2t_avg_pixels_ver(&out0, &out1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (--height != 0);
}
10580a39d0a697ff3603e8c100300fda363658e10b23James Zern
// 4-pixel-wide blocks have no AVX2 kernel in this file: the *_avx2 names are
// aliased to the SSE2 implementations, which are only declared here.
void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd);
void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd);
void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd);
void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *src_ptr,
                                        ptrdiff_t src_pitch, uint16_t *dst_ptr,
                                        ptrdiff_t dst_pitch, uint32_t height,
                                        const int16_t *filter, int bd);
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
10750a39d0a697ff3603e8c100300fda363658e10b23James Zern
10760a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
10770a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
10780a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_2D(, avx2);
10790a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Prototypes for the block1d4 filter routines with the _avg_sse2 suffix
// (h8, h2, v8, v2). Only declarations appear here; the definitions are not in
// this part of the file. The signature matches the non-avg prototypes:
//   void f(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t,
//          const int16_t *, int);
// NOTE(review): presumably the "avg" variants average the filtered result
// with the existing destination pixels -- confirm against the implementation.
10800a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
10810a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            uint16_t *, ptrdiff_t, uint32_t,
10820a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            const int16_t *, int);
10830a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
10840a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            uint16_t *, ptrdiff_t, uint32_t,
10850a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            const int16_t *, int);
10860a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
10870a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            uint16_t *, ptrdiff_t, uint32_t,
10880a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            const int16_t *, int);
10890a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
10900a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            uint16_t *, ptrdiff_t, uint32_t,
10910a39d0a697ff3603e8c100300fda363658e10b23James Zern                                            const int16_t *, int);
// Alias the block1d4 _avg_avx2 names (h8, h2, v8, v2) to the _avg_sse2
// functions declared just above, so later uses of the _avg_avx2 names resolve
// to the SSE2 routines. Each definition is split across two lines with a
// backslash continuation.
10920a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
10930a39d0a697ff3603e8c100300fda363658e10b23James Zern  vpx_highbd_filter_block1d4_h8_avg_sse2
10940a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
10950a39d0a697ff3603e8c100300fda363658e10b23James Zern  vpx_highbd_filter_block1d4_h2_avg_sse2
10960a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
10970a39d0a697ff3603e8c100300fda363658e10b23James Zern  vpx_highbd_filter_block1d4_v8_avg_sse2
10980a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
10990a39d0a697ff3603e8c100300fda363658e10b23James Zern  vpx_highbd_filter_block1d4_v2_avg_sse2
11000a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Instantiate the averaging convolve entry points for the avx2 suffix. The
// arguments mirror the non-averaging instantiations earlier in the file,
// except that the name argument is avg_horiz / avg_vert and the averaging
// prefix argument is `avg_`. The vertical instantiation again passes
// `src - src_stride * 3` as its source expression.
// NOTE(review): the expansions depend on HIGH_FUN_CONV_1D / HIGH_FUN_CONV_2D,
// which are defined outside this part of the file -- confirm the generated
// function names there.
11010a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);
11020a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
11030a39d0a697ff3603e8c100300fda363658e10b23James Zern                 avx2);
11040a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_2D(avg_, avx2);
11050a39d0a697ff3603e8c100300fda363658e10b23James Zern
// Remove any HIGHBD_FUNC macro definition from this point on.
// NOTE(review): the matching #define is not visible in this part of the file;
// #undef on a name that is not currently a macro is ignored by the
// preprocessor, so this line is harmless either way.
11060a39d0a697ff3603e8c100300fda363658e10b23James Zern#undef HIGHBD_FUNC
1107