10a39d0a697ff3603e8c100300fda363658e10b23James Zern/* 20a39d0a697ff3603e8c100300fda363658e10b23James Zern * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 30a39d0a697ff3603e8c100300fda363658e10b23James Zern * 40a39d0a697ff3603e8c100300fda363658e10b23James Zern * Use of this source code is governed by a BSD-style license 50a39d0a697ff3603e8c100300fda363658e10b23James Zern * that can be found in the LICENSE file in the root of the source 60a39d0a697ff3603e8c100300fda363658e10b23James Zern * tree. An additional intellectual property rights grant can be found 70a39d0a697ff3603e8c100300fda363658e10b23James Zern * in the file PATENTS. All contributing project authors may 80a39d0a697ff3603e8c100300fda363658e10b23James Zern * be found in the AUTHORS file in the root of the source tree. 90a39d0a697ff3603e8c100300fda363658e10b23James Zern */ 100a39d0a697ff3603e8c100300fda363658e10b23James Zern 110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include <immintrin.h> 120a39d0a697ff3603e8c100300fda363658e10b23James Zern 130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h" 140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/convolve.h" 150a39d0a697ff3603e8c100300fda363658e10b23James Zern 160a39d0a697ff3603e8c100300fda363658e10b23James Zern// ----------------------------------------------------------------------------- 170a39d0a697ff3603e8c100300fda363658e10b23James Zern// Copy and average 180a39d0a697ff3603e8c100300fda363658e10b23James Zern 190a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, 200a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *dst, ptrdiff_t dst_stride, 210a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *filter_x, int filter_x_stride, 220a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *filter_y, int filter_y_stride, 230a39d0a697ff3603e8c100300fda363658e10b23James Zern int 
width, int h, int bd) { 240a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_x; 250a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_y; 260a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_x_stride; 270a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_y_stride; 280a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)bd; 290a39d0a697ff3603e8c100300fda363658e10b23James Zern 300a39d0a697ff3603e8c100300fda363658e10b23James Zern assert(width % 4 == 0); 310a39d0a697ff3603e8c100300fda363658e10b23James Zern if (width > 32) { // width = 64 320a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 330a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); 340a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 350a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); 360a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); 370a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 380a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p0); 390a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 16), p1); 400a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 32), p2); 410a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 48), p3); 420a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 430a39d0a697ff3603e8c100300fda363658e10b23James Zern h--; 440a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 450a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 16) { // width = 32 460a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 470a39d0a697ff3603e8c100300fda363658e10b23James Zern 
const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); 480a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 490a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 500a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p0); 510a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 16), p1); 520a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 530a39d0a697ff3603e8c100300fda363658e10b23James Zern h--; 540a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 550a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 8) { // width = 16 560a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p0, p1; 570a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 580a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm256_loadu_si256((const __m256i *)src); 590a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 600a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm256_loadu_si256((const __m256i *)src); 610a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 620a39d0a697ff3603e8c100300fda363658e10b23James Zern 630a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p0); 640a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 650a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p1); 660a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 670a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 680a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 690a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 4) { // width = 8 700a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i p0, p1; 710a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 720a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 
= _mm_loadu_si128((const __m128i *)src); 730a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 740a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm_loadu_si128((const __m128i *)src); 750a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 760a39d0a697ff3603e8c100300fda363658e10b23James Zern 770a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, p0); 780a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 790a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, p1); 800a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 810a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 820a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 830a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { // width = 4 840a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i p0, p1; 850a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 860a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm_loadl_epi64((const __m128i *)src); 870a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 880a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm_loadl_epi64((const __m128i *)src); 890a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 900a39d0a697ff3603e8c100300fda363658e10b23James Zern 910a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storel_epi64((__m128i *)dst, p0); 920a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 930a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storel_epi64((__m128i *)dst, p1); 940a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 950a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 960a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 970a39d0a697ff3603e8c100300fda363658e10b23James Zern } 980a39d0a697ff3603e8c100300fda363658e10b23James Zern} 990a39d0a697ff3603e8c100300fda363658e10b23James 
Zern 1000a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, 1010a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *dst, ptrdiff_t dst_stride, 1020a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *filter_x, int filter_x_stride, 1030a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *filter_y, int filter_y_stride, 1040a39d0a697ff3603e8c100300fda363658e10b23James Zern int width, int h, int bd) { 1050a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_x; 1060a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_y; 1070a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_x_stride; 1080a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)filter_y_stride; 1090a39d0a697ff3603e8c100300fda363658e10b23James Zern (void)bd; 1100a39d0a697ff3603e8c100300fda363658e10b23James Zern 1110a39d0a697ff3603e8c100300fda363658e10b23James Zern assert(width % 4 == 0); 1120a39d0a697ff3603e8c100300fda363658e10b23James Zern if (width > 32) { // width = 64 1130a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p0, p1, p2, p3, u0, u1, u2, u3; 1140a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 1150a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm256_loadu_si256((const __m256i *)src); 1160a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 1170a39d0a697ff3603e8c100300fda363658e10b23James Zern p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); 1180a39d0a697ff3603e8c100300fda363658e10b23James Zern p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); 1190a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1200a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_loadu_si256((const __m256i *)dst); 1210a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); 1220a39d0a697ff3603e8c100300fda363658e10b23James Zern 
u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); 1230a39d0a697ff3603e8c100300fda363658e10b23James Zern u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); 1240a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 1250a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); 1260a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); 1270a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); 1280a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 1290a39d0a697ff3603e8c100300fda363658e10b23James Zern h--; 1300a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 1310a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 16) { // width = 32 1320a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p0, p1, u0, u1; 1330a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 1340a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm256_loadu_si256((const __m256i *)src); 1350a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 1360a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride; 1370a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_loadu_si256((const __m256i *)dst); 1380a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); 1390a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 1400a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); 1410a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride; 1420a39d0a697ff3603e8c100300fda363658e10b23James Zern h--; 1430a39d0a697ff3603e8c100300fda363658e10b23James 
Zern } while (h > 0); 1440a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 8) { // width = 16 1450a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p0, p1, u0, u1; 1460a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 1470a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm256_loadu_si256((const __m256i *)src); 1480a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); 1490a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride << 1; 1500a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_loadu_si256((const __m256i *)dst); 1510a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); 1520a39d0a697ff3603e8c100300fda363658e10b23James Zern 1530a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 1540a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + dst_stride), 1550a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_avg_epu16(p1, u1)); 1560a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride << 1; 1570a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 1580a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 1590a39d0a697ff3603e8c100300fda363658e10b23James Zern } else if (width > 4) { // width = 8 1600a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i p0, p1, u0, u1; 1610a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 1620a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm_loadu_si128((const __m128i *)src); 1630a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); 1640a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride << 1; 1650a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm_loadu_si128((const __m128i *)dst); 1660a39d0a697ff3603e8c100300fda363658e10b23James Zern 
u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); 1670a39d0a697ff3603e8c100300fda363658e10b23James Zern 1680a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); 1690a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); 1700a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride << 1; 1710a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 1720a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 1730a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { // width = 4 1740a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i p0, p1, u0, u1; 1750a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 1760a39d0a697ff3603e8c100300fda363658e10b23James Zern p0 = _mm_loadl_epi64((const __m128i *)src); 1770a39d0a697ff3603e8c100300fda363658e10b23James Zern p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); 1780a39d0a697ff3603e8c100300fda363658e10b23James Zern src += src_stride << 1; 1790a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm_loadl_epi64((const __m128i *)dst); 1800a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); 1810a39d0a697ff3603e8c100300fda363658e10b23James Zern 1820a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); 1830a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); 1840a39d0a697ff3603e8c100300fda363658e10b23James Zern dst += dst_stride << 1; 1850a39d0a697ff3603e8c100300fda363658e10b23James Zern h -= 2; 1860a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (h > 0); 1870a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1880a39d0a697ff3603e8c100300fda363658e10b23James Zern} 1890a39d0a697ff3603e8c100300fda363658e10b23James Zern 1900a39d0a697ff3603e8c100300fda363658e10b23James Zern// 
// -----------------------------------------------------------------------------
// Horizontal and vertical filtering

// Right shift applied to the filter accumulator; half of (1 << this) is added
// first as the rounding term.
#define CONV8_ROUNDING_BITS (7)

// Byte-shuffle controls (used with _mm256_shuffle_epi8). Both 128-bit lanes
// carry the same 16 byte indices; each group of four bytes selects two
// adjacent 16-bit samples.

// Sample pairs (0,1) (1,2) (2,3) (3,4).
static const uint8_t signal_pattern_0[32] = {
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,  // low lane
  0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9   // high lane
};

// Sample pairs (2,3) (3,4) (4,5) (5,6).
static const uint8_t signal_pattern_1[32] = {
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13,  // low lane
  4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13   // high lane
};

// Sample pairs (3,4) (4,5) (5,6) (6,7).
static const uint8_t signal_pattern_2[32] = {
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15,  // low lane
  6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15   // high lane
};

// 32-bit element selector (used with _mm256_permutevar8x32_epi32): places
// source dwords 2..5 in both the low and the high lane.
static const uint32_t signal_index[8] = {
  2, 3, 4, 5,  // low lane
  2, 3, 4, 5   // high lane
};

// -----------------------------------------------------------------------------
// Horizontal Filtering
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern 2140a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { 2150a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 2160a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); 2170a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); 2180a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); 2190a39d0a697ff3603e8c100300fda363658e10b23James Zern 2200a39d0a697ff3603e8c100300fda363658e10b23James Zern p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 2210a39d0a697ff3603e8c100300fda363658e10b23James Zern p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 2220a39d0a697ff3603e8c100300fda363658e10b23James Zern p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 2230a39d0a697ff3603e8c100300fda363658e10b23James Zern p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 2240a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2250a39d0a697ff3603e8c100300fda363658e10b23James Zern 2260a39d0a697ff3603e8c100300fda363658e10b23James Zern// Note: 2270a39d0a697ff3603e8c100300fda363658e10b23James Zern// Shared by 8x2 and 16x1 block 2280a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, 2290a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *x /*x[8]*/) { 2300a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i pp[8]; 2310a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_pixels(s0, pp); 2320a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_pixels(s1, &pp[4]); 2330a39d0a697ff3603e8c100300fda363658e10b23James Zern x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); 2340a39d0a697ff3603e8c100300fda363658e10b23James Zern x[1] = 
_mm256_permute2x128_si256(pp[1], pp[5], 0x20); 2350a39d0a697ff3603e8c100300fda363658e10b23James Zern x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); 2360a39d0a697ff3603e8c100300fda363658e10b23James Zern x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); 2370a39d0a697ff3603e8c100300fda363658e10b23James Zern x[4] = x[2]; 2380a39d0a697ff3603e8c100300fda363658e10b23James Zern x[5] = x[3]; 2390a39d0a697ff3603e8c100300fda363658e10b23James Zern x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); 2400a39d0a697ff3603e8c100300fda363658e10b23James Zern x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); 2410a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2420a39d0a697ff3603e8c100300fda363658e10b23James Zern 2430a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { 2440a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i pp[8]; 2450a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s0; 2460a39d0a697ff3603e8c100300fda363658e10b23James Zern s0 = _mm256_loadu_si256((const __m256i *)src); 2470a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_pixels(&s0, pp); 2480a39d0a697ff3603e8c100300fda363658e10b23James Zern x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); 2490a39d0a697ff3603e8c100300fda363658e10b23James Zern x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); 2500a39d0a697ff3603e8c100300fda363658e10b23James Zern x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); 2510a39d0a697ff3603e8c100300fda363658e10b23James Zern x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); 2520a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2530a39d0a697ff3603e8c100300fda363658e10b23James Zern 2540a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, 2550a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *x) { 2560a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s0, s1; 
2570a39d0a697ff3603e8c100300fda363658e10b23James Zern s0 = _mm256_loadu_si256((const __m256i *)src); 2580a39d0a697ff3603e8c100300fda363658e10b23James Zern s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); 2590a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16_pixels(&s0, &s1, x); 2600a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2610a39d0a697ff3603e8c100300fda363658e10b23James Zern 2620a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { 2630a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s0, s1; 2640a39d0a697ff3603e8c100300fda363658e10b23James Zern s0 = _mm256_loadu_si256((const __m256i *)src); 2650a39d0a697ff3603e8c100300fda363658e10b23James Zern s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); 2660a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16_pixels(&s0, &s1, x); 2670a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2680a39d0a697ff3603e8c100300fda363658e10b23James Zern 2690a39d0a697ff3603e8c100300fda363658e10b23James Zern// Note: 2700a39d0a697ff3603e8c100300fda363658e10b23James Zern// Shared by horizontal and vertical filtering 2710a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { 2720a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i h = _mm_loadu_si128((const __m128i *)filter); 2730a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); 2740a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p0 = _mm256_set1_epi32(0x03020100); 2750a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p1 = _mm256_set1_epi32(0x07060504); 2760a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); 2770a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); 
2780a39d0a697ff3603e8c100300fda363658e10b23James Zern f[0] = _mm256_shuffle_epi8(hh, p0); 2790a39d0a697ff3603e8c100300fda363658e10b23James Zern f[1] = _mm256_shuffle_epi8(hh, p1); 2800a39d0a697ff3603e8c100300fda363658e10b23James Zern f[2] = _mm256_shuffle_epi8(hh, p2); 2810a39d0a697ff3603e8c100300fda363658e10b23James Zern f[3] = _mm256_shuffle_epi8(hh, p3); 2820a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2830a39d0a697ff3603e8c100300fda363658e10b23James Zern 2840a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, 2850a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *fil /*fil[4]*/, 2860a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y) { 2870a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i a, a0, a1; 2880a39d0a697ff3603e8c100300fda363658e10b23James Zern 2890a39d0a697ff3603e8c100300fda363658e10b23James Zern a0 = _mm256_madd_epi16(fil[0], sig[0]); 2900a39d0a697ff3603e8c100300fda363658e10b23James Zern a1 = _mm256_madd_epi16(fil[3], sig[3]); 2910a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_add_epi32(a0, a1); 2920a39d0a697ff3603e8c100300fda363658e10b23James Zern 2930a39d0a697ff3603e8c100300fda363658e10b23James Zern a0 = _mm256_madd_epi16(fil[1], sig[1]); 2940a39d0a697ff3603e8c100300fda363658e10b23James Zern a1 = _mm256_madd_epi16(fil[2], sig[2]); 2950a39d0a697ff3603e8c100300fda363658e10b23James Zern 2960a39d0a697ff3603e8c100300fda363658e10b23James Zern { 2970a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i min = _mm256_min_epi32(a0, a1); 2980a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_add_epi32(a, min); 2990a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3000a39d0a697ff3603e8c100300fda363658e10b23James Zern { 3010a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_max_epi32(a0, a1); 3020a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_add_epi32(a, max); 
3030a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3040a39d0a697ff3603e8c100300fda363658e10b23James Zern { 3050a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 3060a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_add_epi32(a, rounding); 3070a39d0a697ff3603e8c100300fda363658e10b23James Zern *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); 3080a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3090a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3100a39d0a697ff3603e8c100300fda363658e10b23James Zern 3110a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, 3120a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *dst) { 3130a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i a0 = _mm256_castsi256_si128(*y); 3140a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i a1 = _mm256_extractf128_si256(*y, 1); 3150a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i res = _mm_packus_epi32(a0, a1); 3160a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); 3170a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, res); 3180a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3190a39d0a697ff3603e8c100300fda363658e10b23James Zern 3200a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, 3210a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *mask, uint16_t *dst, 3220a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t pitch) { 3230a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i a = _mm256_packus_epi32(*y0, *y1); 3240a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_min_epi16(a, *mask); 3250a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); 
3260a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); 3270a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3280a39d0a697ff3603e8c100300fda363658e10b23James Zern 3290a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, 3300a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *mask, uint16_t *dst) { 3310a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i a = _mm256_packus_epi32(*y0, *y1); 3320a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_min_epi16(a, *mask); 3330a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, a); 3340a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3350a39d0a697ff3603e8c100300fda363658e10b23James Zern 3360a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_h8_avx2( 3370a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 3380a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 3390a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[8], res0, res1; 3400a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 3410a39d0a697ff3603e8c100300fda363658e10b23James Zern 3420a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 3430a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 3440a39d0a697ff3603e8c100300fda363658e10b23James Zern 3450a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 3460a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 3470a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_pixels(src_ptr, src_pitch, signal); 3480a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 
3490a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&signal[4], ff, &res1); 3500a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 3510a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 3520a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 3530a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 3540a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 1); 3550a39d0a697ff3603e8c100300fda363658e10b23James Zern 3560a39d0a697ff3603e8c100300fda363658e10b23James Zern if (height > 0) { 3570a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_pixels(src_ptr, signal); 3580a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 3590a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x1_pixels(&res0, &max, dst_ptr); 3600a39d0a697ff3603e8c100300fda363658e10b23James Zern } 3610a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3620a39d0a697ff3603e8c100300fda363658e10b23James Zern 3630a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_h8_avx2( 3640a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 3650a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 3660a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[8], res0, res1; 3670a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 3680a39d0a697ff3603e8c100300fda363658e10b23James Zern 3690a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 3700a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 3710a39d0a697ff3603e8c100300fda363658e10b23James Zern 3720a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 3730a39d0a697ff3603e8c100300fda363658e10b23James Zern do 
{ 3740a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x1_pixels(src_ptr, signal); 3750a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 3760a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&signal[4], ff, &res1); 3770a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_pixels(&res0, &res1, &max, dst_ptr); 3780a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 3790a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 3800a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 3810a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 3820a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3830a39d0a697ff3603e8c100300fda363658e10b23James Zern 3840a39d0a697ff3603e8c100300fda363658e10b23James Zern// ----------------------------------------------------------------------------- 3850a39d0a697ff3603e8c100300fda363658e10b23James Zern// 2-tap horizontal filtering 3860a39d0a697ff3603e8c100300fda363658e10b23James Zern 3870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { 3880a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i h = _mm_loadu_si128((const __m128i *)filter); 3890a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); 3900a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i p = _mm256_set1_epi32(0x09080706); 3910a39d0a697ff3603e8c100300fda363658e10b23James Zern f[0] = _mm256_shuffle_epi8(hh, p); 3920a39d0a697ff3603e8c100300fda363658e10b23James Zern} 3930a39d0a697ff3603e8c100300fda363658e10b23James Zern 3940a39d0a697ff3603e8c100300fda363658e10b23James Zern// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() 3950a39d0a697ff3603e8c100300fda363658e10b23James Zern// the difference is s0/s1 specifies first and second rows or, 
3960a39d0a697ff3603e8c100300fda363658e10b23James Zern// first 16 samples and 8-sample shifted 16 samples 3970a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, 3980a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig) { 3990a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 4000a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); 4010a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); 4020a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); 4030a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); 4040a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); 4050a39d0a697ff3603e8c100300fda363658e10b23James Zern r0 = _mm256_shuffle_epi8(r0, sf2); 4060a39d0a697ff3603e8c100300fda363658e10b23James Zern r1 = _mm256_shuffle_epi8(r1, sf2); 4070a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); 4080a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); 4090a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4100a39d0a697ff3603e8c100300fda363658e10b23James Zern 4110a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x2_2t_pixels(const uint16_t *src, 4120a39d0a697ff3603e8c100300fda363658e10b23James Zern const ptrdiff_t pitch, __m256i *sig) { 4130a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 4140a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); 4150a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16_2t_pixels(&r0, &r1, sig); 
4160a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4170a39d0a697ff3603e8c100300fda363658e10b23James Zern 4180a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16x1_2t_pixels(const uint16_t *src, 4190a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig /*sig[2]*/) { 4200a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 4210a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); 4220a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16_2t_pixels(&r0, &r1, sig); 4230a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4240a39d0a697ff3603e8c100300fda363658e10b23James Zern 4250a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x1_2t_pixels(const uint16_t *src, 4260a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig /*sig[2]*/) { 4270a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 4280a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); 4290a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 4300a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x0 = _mm256_shuffle_epi8(r0, sf2); 4310a39d0a697ff3603e8c100300fda363658e10b23James Zern r0 = _mm256_permutevar8x32_epi32(r0, idx); 4320a39d0a697ff3603e8c100300fda363658e10b23James Zern r0 = _mm256_shuffle_epi8(r0, sf2); 4330a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); 4340a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4350a39d0a697ff3603e8c100300fda363658e10b23James Zern 4360a39d0a697ff3603e8c100300fda363658e10b23James Zern// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() 4370a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void 
filter_16_2t_pixels(const __m256i *sig, const __m256i *f, 4380a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y0, __m256i *y1) { 4390a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 4400a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x0 = _mm256_madd_epi16(sig[0], *f); 4410a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x1 = _mm256_madd_epi16(sig[1], *f); 4420a39d0a697ff3603e8c100300fda363658e10b23James Zern x0 = _mm256_add_epi32(x0, rounding); 4430a39d0a697ff3603e8c100300fda363658e10b23James Zern x1 = _mm256_add_epi32(x1, rounding); 4440a39d0a697ff3603e8c100300fda363658e10b23James Zern *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); 4450a39d0a697ff3603e8c100300fda363658e10b23James Zern *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); 4460a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4470a39d0a697ff3603e8c100300fda363658e10b23James Zern 4480a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, 4490a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y0) { 4500a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 4510a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i x0 = _mm256_madd_epi16(sig[0], *f); 4520a39d0a697ff3603e8c100300fda363658e10b23James Zern x0 = _mm256_add_epi32(x0, rounding); 4530a39d0a697ff3603e8c100300fda363658e10b23James Zern *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); 4540a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4550a39d0a697ff3603e8c100300fda363658e10b23James Zern 4560a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_h2_avx2( 4570a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 4580a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, 
uint32_t height, const int16_t *filter, int bd) { 4590a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[2], res0, res1; 4600a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 4610a39d0a697ff3603e8c100300fda363658e10b23James Zern 4620a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 4630a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 4640a39d0a697ff3603e8c100300fda363658e10b23James Zern 4650a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 4660a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 4670a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_2t_pixels(src_ptr, src_pitch, signal); 4680a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16_2t_pixels(signal, &ff, &res0, &res1); 4690a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 4700a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 4710a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 4720a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 4730a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 1); 4740a39d0a697ff3603e8c100300fda363658e10b23James Zern 4750a39d0a697ff3603e8c100300fda363658e10b23James Zern if (height > 0) { 4760a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_2t_pixels(src_ptr, signal); 4770a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_2t_pixels(signal, &ff, &res0); 4780a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x1_pixels(&res0, &max, dst_ptr); 4790a39d0a697ff3603e8c100300fda363658e10b23James Zern } 4800a39d0a697ff3603e8c100300fda363658e10b23James Zern} 4810a39d0a697ff3603e8c100300fda363658e10b23James Zern 4820a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_h2_avx2( 4830a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t 
*src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 4840a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 4850a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[2], res0, res1; 4860a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 4870a39d0a697ff3603e8c100300fda363658e10b23James Zern 4880a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 4890a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 4900a39d0a697ff3603e8c100300fda363658e10b23James Zern 4910a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 4920a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 4930a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x1_2t_pixels(src_ptr, signal); 4940a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16_2t_pixels(signal, &ff, &res0, &res1); 4950a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_pixels(&res0, &res1, &max, dst_ptr); 4960a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 4970a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 4980a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 4990a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 5000a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5010a39d0a697ff3603e8c100300fda363658e10b23James Zern 5020a39d0a697ff3603e8c100300fda363658e10b23James Zern// ----------------------------------------------------------------------------- 5030a39d0a697ff3603e8c100300fda363658e10b23James Zern// Vertical Filtering 5040a39d0a697ff3603e8c100300fda363658e10b23James Zern 5050a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { 5060a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); 
5070a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s1 = 5080a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); 5090a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s2 = _mm256_castsi128_si256( 5100a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); 5110a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s3 = _mm256_castsi128_si256( 5120a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); 5130a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s4 = _mm256_castsi128_si256( 5140a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); 5150a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s5 = _mm256_castsi128_si256( 5160a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); 5170a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s6 = _mm256_castsi128_si256( 5180a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); 5190a39d0a697ff3603e8c100300fda363658e10b23James Zern 5200a39d0a697ff3603e8c100300fda363658e10b23James Zern s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); 5210a39d0a697ff3603e8c100300fda363658e10b23James Zern s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); 5220a39d0a697ff3603e8c100300fda363658e10b23James Zern s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); 5230a39d0a697ff3603e8c100300fda363658e10b23James Zern s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); 5240a39d0a697ff3603e8c100300fda363658e10b23James Zern s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); 5250a39d0a697ff3603e8c100300fda363658e10b23James Zern s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); 
5260a39d0a697ff3603e8c100300fda363658e10b23James Zern 5270a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm256_unpacklo_epi16(s0, s1); 5280a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[4] = _mm256_unpackhi_epi16(s0, s1); 5290a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[1] = _mm256_unpacklo_epi16(s2, s3); 5300a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[5] = _mm256_unpackhi_epi16(s2, s3); 5310a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = _mm256_unpacklo_epi16(s4, s5); 5320a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[6] = _mm256_unpackhi_epi16(s4, s5); 5330a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[8] = s6; 5340a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5350a39d0a697ff3603e8c100300fda363658e10b23James Zern 5360a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, 5370a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig) { 5380a39d0a697ff3603e8c100300fda363658e10b23James Zern // base + 7th row 5390a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s0 = _mm256_castsi128_si256( 5400a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); 5410a39d0a697ff3603e8c100300fda363658e10b23James Zern // base + 8th row 5420a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s1 = _mm256_castsi128_si256( 5430a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); 5440a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); 5450a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); 5460a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[3] = _mm256_unpacklo_epi16(s2, s3); 5470a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[7] = _mm256_unpackhi_epi16(s2, s3); 
5480a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[8] = s1; 5490a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5500a39d0a697ff3603e8c100300fda363658e10b23James Zern 5510a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, 5520a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y0, __m256i *y1) { 5530a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(sig, f, y0); 5540a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&sig[4], f, y1); 5550a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5560a39d0a697ff3603e8c100300fda363658e10b23James Zern 5570a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void update_pixels(__m256i *sig) { 5580a39d0a697ff3603e8c100300fda363658e10b23James Zern int i; 5590a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 3; ++i) { 5600a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[i] = sig[i + 1]; 5610a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[i + 4] = sig[i + 5]; 5620a39d0a697ff3603e8c100300fda363658e10b23James Zern } 5630a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5640a39d0a697ff3603e8c100300fda363658e10b23James Zern 5650a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_v8_avx2( 5660a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 5670a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 5680a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[9], res0, res1; 5690a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 5700a39d0a697ff3603e8c100300fda363658e10b23James Zern 5710a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 5720a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 
5730a39d0a697ff3603e8c100300fda363658e10b23James Zern 5740a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x9_init(src_ptr, src_pitch, signal); 5750a39d0a697ff3603e8c100300fda363658e10b23James Zern 5760a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 5770a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x9_pixels(src_ptr, src_pitch, signal); 5780a39d0a697ff3603e8c100300fda363658e10b23James Zern 5790a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x9_pixels(signal, ff, &res0, &res1); 5800a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 5810a39d0a697ff3603e8c100300fda363658e10b23James Zern update_pixels(signal); 5820a39d0a697ff3603e8c100300fda363658e10b23James Zern 5830a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 5840a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 5850a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 5860a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 5870a39d0a697ff3603e8c100300fda363658e10b23James Zern} 5880a39d0a697ff3603e8c100300fda363658e10b23James Zern 5890a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { 5900a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i u0, u1, u2, u3; 5910a39d0a697ff3603e8c100300fda363658e10b23James Zern // load 0-6 rows 5920a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); 5930a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); 5940a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); 5950a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); 
5960a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); 5970a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); 5980a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); 5990a39d0a697ff3603e8c100300fda363658e10b23James Zern 6000a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low 6010a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high 6020a39d0a697ff3603e8c100300fda363658e10b23James Zern 6030a39d0a697ff3603e8c100300fda363658e10b23James Zern u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low 6040a39d0a697ff3603e8c100300fda363658e10b23James Zern u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high 6050a39d0a697ff3603e8c100300fda363658e10b23James Zern 6060a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm256_unpacklo_epi16(u0, u2); 6070a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[4] = _mm256_unpackhi_epi16(u0, u2); 6080a39d0a697ff3603e8c100300fda363658e10b23James Zern 6090a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[8] = _mm256_unpacklo_epi16(u1, u3); 6100a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[12] = _mm256_unpackhi_epi16(u1, u3); 6110a39d0a697ff3603e8c100300fda363658e10b23James Zern 6120a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_permute2x128_si256(s2, s3, 0x20); 6130a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_permute2x128_si256(s2, s3, 0x31); 6140a39d0a697ff3603e8c100300fda363658e10b23James Zern 6150a39d0a697ff3603e8c100300fda363658e10b23James Zern u2 = _mm256_permute2x128_si256(s3, s4, 0x20); 6160a39d0a697ff3603e8c100300fda363658e10b23James Zern u3 = _mm256_permute2x128_si256(s3, s4, 0x31); 6170a39d0a697ff3603e8c100300fda363658e10b23James Zern 
6180a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[1] = _mm256_unpacklo_epi16(u0, u2); 6190a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[5] = _mm256_unpackhi_epi16(u0, u2); 6200a39d0a697ff3603e8c100300fda363658e10b23James Zern 6210a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[9] = _mm256_unpacklo_epi16(u1, u3); 6220a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[13] = _mm256_unpackhi_epi16(u1, u3); 6230a39d0a697ff3603e8c100300fda363658e10b23James Zern 6240a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_permute2x128_si256(s4, s5, 0x20); 6250a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_permute2x128_si256(s4, s5, 0x31); 6260a39d0a697ff3603e8c100300fda363658e10b23James Zern 6270a39d0a697ff3603e8c100300fda363658e10b23James Zern u2 = _mm256_permute2x128_si256(s5, s6, 0x20); 6280a39d0a697ff3603e8c100300fda363658e10b23James Zern u3 = _mm256_permute2x128_si256(s5, s6, 0x31); 6290a39d0a697ff3603e8c100300fda363658e10b23James Zern 6300a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = _mm256_unpacklo_epi16(u0, u2); 6310a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[6] = _mm256_unpackhi_epi16(u0, u2); 6320a39d0a697ff3603e8c100300fda363658e10b23James Zern 6330a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[10] = _mm256_unpacklo_epi16(u1, u3); 6340a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[14] = _mm256_unpackhi_epi16(u1, u3); 6350a39d0a697ff3603e8c100300fda363658e10b23James Zern 6360a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[16] = s6; 6370a39d0a697ff3603e8c100300fda363658e10b23James Zern} 6380a39d0a697ff3603e8c100300fda363658e10b23James Zern 6390a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, 6400a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig) { 6410a39d0a697ff3603e8c100300fda363658e10b23James Zern // base + 7th row 6420a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s7 = 
_mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); 6430a39d0a697ff3603e8c100300fda363658e10b23James Zern // base + 8th row 6440a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); 6450a39d0a697ff3603e8c100300fda363658e10b23James Zern 6460a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i u0, u1, u2, u3; 6470a39d0a697ff3603e8c100300fda363658e10b23James Zern u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); 6480a39d0a697ff3603e8c100300fda363658e10b23James Zern u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); 6490a39d0a697ff3603e8c100300fda363658e10b23James Zern 6500a39d0a697ff3603e8c100300fda363658e10b23James Zern u2 = _mm256_permute2x128_si256(s7, s8, 0x20); 6510a39d0a697ff3603e8c100300fda363658e10b23James Zern u3 = _mm256_permute2x128_si256(s7, s8, 0x31); 6520a39d0a697ff3603e8c100300fda363658e10b23James Zern 6530a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[3] = _mm256_unpacklo_epi16(u0, u2); 6540a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[7] = _mm256_unpackhi_epi16(u0, u2); 6550a39d0a697ff3603e8c100300fda363658e10b23James Zern 6560a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[11] = _mm256_unpacklo_epi16(u1, u3); 6570a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[15] = _mm256_unpackhi_epi16(u1, u3); 6580a39d0a697ff3603e8c100300fda363658e10b23James Zern 6590a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[16] = s8; 6600a39d0a697ff3603e8c100300fda363658e10b23James Zern} 6610a39d0a697ff3603e8c100300fda363658e10b23James Zern 6620a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, 6630a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y0, __m256i *y1) { 6640a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i res[4]; 6650a39d0a697ff3603e8c100300fda363658e10b23James Zern int i; 6660a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 4; ++i) { 
6670a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&sig[i << 2], f, &res[i]); 6680a39d0a697ff3603e8c100300fda363658e10b23James Zern } 6690a39d0a697ff3603e8c100300fda363658e10b23James Zern 6700a39d0a697ff3603e8c100300fda363658e10b23James Zern { 6710a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); 6720a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); 6730a39d0a697ff3603e8c100300fda363658e10b23James Zern *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); 6740a39d0a697ff3603e8c100300fda363658e10b23James Zern *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); 6750a39d0a697ff3603e8c100300fda363658e10b23James Zern } 6760a39d0a697ff3603e8c100300fda363658e10b23James Zern} 6770a39d0a697ff3603e8c100300fda363658e10b23James Zern 6780a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, 6790a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *mask, uint16_t *dst, 6800a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t pitch) { 6810a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p = _mm256_min_epi16(*y0, *mask); 6820a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p); 6830a39d0a697ff3603e8c100300fda363658e10b23James Zern p = _mm256_min_epi16(*y1, *mask); 6840a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + pitch), p); 6850a39d0a697ff3603e8c100300fda363658e10b23James Zern} 6860a39d0a697ff3603e8c100300fda363658e10b23James Zern 6870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void update_16x9_pixels(__m256i *sig) { 6880a39d0a697ff3603e8c100300fda363658e10b23James Zern update_pixels(&sig[0]); 6890a39d0a697ff3603e8c100300fda363658e10b23James Zern update_pixels(&sig[8]); 6900a39d0a697ff3603e8c100300fda363658e10b23James Zern} 
6910a39d0a697ff3603e8c100300fda363658e10b23James Zern 6920a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_v8_avx2( 6930a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 6940a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 6950a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[17], res0, res1; 6960a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 6970a39d0a697ff3603e8c100300fda363658e10b23James Zern 6980a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 6990a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 7000a39d0a697ff3603e8c100300fda363658e10b23James Zern 7010a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x9_init(src_ptr, src_pitch, signal); 7020a39d0a697ff3603e8c100300fda363658e10b23James Zern 7030a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 7040a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x9_pixels(src_ptr, src_pitch, signal); 7050a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16x9_pixels(signal, ff, &res0, &res1); 7060a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 7070a39d0a697ff3603e8c100300fda363658e10b23James Zern update_16x9_pixels(signal); 7080a39d0a697ff3603e8c100300fda363658e10b23James Zern 7090a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 7100a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 7110a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 7120a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 7130a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7140a39d0a697ff3603e8c100300fda363658e10b23James Zern 7150a39d0a697ff3603e8c100300fda363658e10b23James Zern// 
----------------------------------------------------------------------------- 7160a39d0a697ff3603e8c100300fda363658e10b23James Zern// 2-tap vertical filtering 7170a39d0a697ff3603e8c100300fda363658e10b23James Zern 7180a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void pack_16x2_init(const uint16_t *src, __m256i *sig) { 7190a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = _mm256_loadu_si256((const __m256i *)src); 7200a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7210a39d0a697ff3603e8c100300fda363658e10b23James Zern 7220a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, 7230a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *sig) { 7240a39d0a697ff3603e8c100300fda363658e10b23James Zern // load the next row 7250a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); 7260a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm256_unpacklo_epi16(sig[2], u); 7270a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[1] = _mm256_unpackhi_epi16(sig[2], u); 7280a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = u; 7290a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7300a39d0a697ff3603e8c100300fda363658e10b23James Zern 7310a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, 7320a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i *y0, __m256i *y1) { 7330a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16_2t_pixels(sig, f, y0, y1); 7340a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7350a39d0a697ff3603e8c100300fda363658e10b23James Zern 7360a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_v2_avx2( 7370a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 7380a39d0a697ff3603e8c100300fda363658e10b23James 
Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 7390a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[3], res0, res1; 7400a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 7410a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 7420a39d0a697ff3603e8c100300fda363658e10b23James Zern 7430a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 7440a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x2_init(src_ptr, signal); 7450a39d0a697ff3603e8c100300fda363658e10b23James Zern 7460a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 7470a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x2_2t_pixels(src_ptr, src_pitch, signal); 7480a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16x2_2t_pixels(signal, &ff, &res0, &res1); 7490a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_pixels(&res0, &res1, &max, dst_ptr); 7500a39d0a697ff3603e8c100300fda363658e10b23James Zern 7510a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 7520a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 7530a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 7540a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 7550a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7560a39d0a697ff3603e8c100300fda363658e10b23James Zern 7570a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { 7580a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i h = _mm_loadu_si128((const __m128i *)filter); 7590a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i p = _mm_set1_epi32(0x09080706); 7600a39d0a697ff3603e8c100300fda363658e10b23James Zern f[0] = _mm_shuffle_epi8(h, p); 7610a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7620a39d0a697ff3603e8c100300fda363658e10b23James Zern 
7630a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void pack_8x2_init(const uint16_t *src, __m128i *sig) { 7640a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = _mm_loadu_si128((const __m128i *)src); 7650a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7660a39d0a697ff3603e8c100300fda363658e10b23James Zern 7670a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, 7680a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i *sig) { 7690a39d0a697ff3603e8c100300fda363658e10b23James Zern // load the next row 7700a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); 7710a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[0] = _mm_unpacklo_epi16(sig[2], u); 7720a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[1] = _mm_unpackhi_epi16(sig[2], u); 7730a39d0a697ff3603e8c100300fda363658e10b23James Zern sig[2] = u; 7740a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7750a39d0a697ff3603e8c100300fda363658e10b23James Zern 7760a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, 7770a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i *y0, __m128i *y1) { 7780a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 7790a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i x0 = _mm_madd_epi16(sig[0], *f); 7800a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i x1 = _mm_madd_epi16(sig[1], *f); 7810a39d0a697ff3603e8c100300fda363658e10b23James Zern x0 = _mm_add_epi32(x0, rounding); 7820a39d0a697ff3603e8c100300fda363658e10b23James Zern x1 = _mm_add_epi32(x1, rounding); 7830a39d0a697ff3603e8c100300fda363658e10b23James Zern *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); 7840a39d0a697ff3603e8c100300fda363658e10b23James Zern *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); 
7850a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7860a39d0a697ff3603e8c100300fda363658e10b23James Zern 7870a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, 7880a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i *mask, uint16_t *dst) { 7890a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i res = _mm_packus_epi32(*y0, *y1); 7900a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_min_epi16(res, *mask); 7910a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, res); 7920a39d0a697ff3603e8c100300fda363658e10b23James Zern} 7930a39d0a697ff3603e8c100300fda363658e10b23James Zern 7940a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_v2_avx2( 7950a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 7960a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 7970a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i signal[3], res0, res1; 7980a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i max = _mm_set1_epi16((1 << bd) - 1); 7990a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i ff; 8000a39d0a697ff3603e8c100300fda363658e10b23James Zern 8010a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_2t_filter(filter, &ff); 8020a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_init(src_ptr, signal); 8030a39d0a697ff3603e8c100300fda363658e10b23James Zern 8040a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 8050a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); 8060a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8_2t_pixels(signal, &ff, &res0, &res1); 8070a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); 
8080a39d0a697ff3603e8c100300fda363658e10b23James Zern 8090a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 8100a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 8110a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 8120a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 8130a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8140a39d0a697ff3603e8c100300fda363658e10b23James Zern 8150a39d0a697ff3603e8c100300fda363658e10b23James Zern// Calculation with averaging the input pixels 8160a39d0a697ff3603e8c100300fda363658e10b23James Zern 8170a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, 8180a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *dst) { 8190a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i a0 = _mm256_castsi256_si128(*y0); 8200a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i a1 = _mm256_extractf128_si256(*y0, 1); 8210a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i res = _mm_packus_epi32(a0, a1); 8220a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i pix = _mm_loadu_si128((const __m128i *)dst); 8230a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); 8240a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_avg_epu16(res, pix); 8250a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, res); 8260a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8270a39d0a697ff3603e8c100300fda363658e10b23James Zern 8280a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, 8290a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *mask, uint16_t *dst, 8300a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t pitch) { 8310a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i a = 
_mm256_packus_epi32(*y0, *y1); 8320a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); 8330a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); 8340a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i pix = 8350a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); 8360a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_min_epi16(a, *mask); 8370a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_avg_epu16(a, pix); 8380a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); 8390a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); 8400a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8410a39d0a697ff3603e8c100300fda363658e10b23James Zern 8420a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, 8430a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i *mask, uint16_t *dst) { 8440a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i a = _mm256_packus_epi32(*y0, *y1); 8450a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); 8460a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_min_epi16(a, *mask); 8470a39d0a697ff3603e8c100300fda363658e10b23James Zern a = _mm256_avg_epu16(a, pix); 8480a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, a); 8490a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8500a39d0a697ff3603e8c100300fda363658e10b23James Zern 8510a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, 8520a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i 
*mask, uint16_t *dst, 8530a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t pitch) { 8540a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); 8550a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); 8560a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i p = _mm256_min_epi16(*y0, *mask); 8570a39d0a697ff3603e8c100300fda363658e10b23James Zern p = _mm256_avg_epu16(p, pix0); 8580a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)dst, p); 8590a39d0a697ff3603e8c100300fda363658e10b23James Zern 8600a39d0a697ff3603e8c100300fda363658e10b23James Zern p = _mm256_min_epi16(*y1, *mask); 8610a39d0a697ff3603e8c100300fda363658e10b23James Zern p = _mm256_avg_epu16(p, pix1); 8620a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm256_storeu_si256((__m256i *)(dst + pitch), p); 8630a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8640a39d0a697ff3603e8c100300fda363658e10b23James Zern 8650a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, 8660a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i *y1, 8670a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i *mask, 8680a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *dst) { 8690a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i res = _mm_packus_epi32(*y0, *y1); 8700a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i pix = _mm_loadu_si128((const __m128i *)dst); 8710a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_min_epi16(res, *mask); 8720a39d0a697ff3603e8c100300fda363658e10b23James Zern res = _mm_avg_epu16(res, pix); 8730a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)dst, res); 8740a39d0a697ff3603e8c100300fda363658e10b23James Zern} 8750a39d0a697ff3603e8c100300fda363658e10b23James Zern 
8760a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_h8_avg_avx2( 8770a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 8780a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 8790a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[8], res0, res1; 8800a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 8810a39d0a697ff3603e8c100300fda363658e10b23James Zern 8820a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 8830a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 8840a39d0a697ff3603e8c100300fda363658e10b23James Zern 8850a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 8860a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 8870a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_pixels(src_ptr, src_pitch, signal); 8880a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 8890a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&signal[4], ff, &res1); 8900a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 8910a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 8920a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 8930a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 8940a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 1); 8950a39d0a697ff3603e8c100300fda363658e10b23James Zern 8960a39d0a697ff3603e8c100300fda363658e10b23James Zern if (height > 0) { 8970a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_pixels(src_ptr, signal); 8980a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 8990a39d0a697ff3603e8c100300fda363658e10b23James Zern 
store_8x1_avg_pixels(&res0, &max, dst_ptr); 9000a39d0a697ff3603e8c100300fda363658e10b23James Zern } 9010a39d0a697ff3603e8c100300fda363658e10b23James Zern} 9020a39d0a697ff3603e8c100300fda363658e10b23James Zern 9030a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_h8_avg_avx2( 9040a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 9050a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 9060a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[8], res0, res1; 9070a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 9080a39d0a697ff3603e8c100300fda363658e10b23James Zern 9090a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 9100a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 9110a39d0a697ff3603e8c100300fda363658e10b23James Zern 9120a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 9130a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 9140a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x1_pixels(src_ptr, signal); 9150a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(signal, ff, &res0); 9160a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_pixels(&signal[4], ff, &res1); 9170a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); 9180a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 9190a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 9200a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 9210a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 9220a39d0a697ff3603e8c100300fda363658e10b23James Zern} 9230a39d0a697ff3603e8c100300fda363658e10b23James Zern 9240a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void 
vpx_highbd_filter_block1d8_v8_avg_avx2( 9250a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 9260a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 9270a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[9], res0, res1; 9280a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 9290a39d0a697ff3603e8c100300fda363658e10b23James Zern 9300a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 9310a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 9320a39d0a697ff3603e8c100300fda363658e10b23James Zern 9330a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x9_init(src_ptr, src_pitch, signal); 9340a39d0a697ff3603e8c100300fda363658e10b23James Zern 9350a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 9360a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x9_pixels(src_ptr, src_pitch, signal); 9370a39d0a697ff3603e8c100300fda363658e10b23James Zern 9380a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x9_pixels(signal, ff, &res0, &res1); 9390a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 9400a39d0a697ff3603e8c100300fda363658e10b23James Zern update_pixels(signal); 9410a39d0a697ff3603e8c100300fda363658e10b23James Zern 9420a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 9430a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 9440a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 9450a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 9460a39d0a697ff3603e8c100300fda363658e10b23James Zern} 9470a39d0a697ff3603e8c100300fda363658e10b23James Zern 9480a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_v8_avg_avx2( 
9490a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 9500a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 9510a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[17], res0, res1; 9520a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 9530a39d0a697ff3603e8c100300fda363658e10b23James Zern 9540a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff[4]; 9550a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_filters(filter, ff); 9560a39d0a697ff3603e8c100300fda363658e10b23James Zern 9570a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x9_init(src_ptr, src_pitch, signal); 9580a39d0a697ff3603e8c100300fda363658e10b23James Zern 9590a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 9600a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x9_pixels(src_ptr, src_pitch, signal); 9610a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16x9_pixels(signal, ff, &res0, &res1); 9620a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 9630a39d0a697ff3603e8c100300fda363658e10b23James Zern update_16x9_pixels(signal); 9640a39d0a697ff3603e8c100300fda363658e10b23James Zern 9650a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 9660a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 9670a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 9680a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 9690a39d0a697ff3603e8c100300fda363658e10b23James Zern} 9700a39d0a697ff3603e8c100300fda363658e10b23James Zern 9710a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_h2_avg_avx2( 9720a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 
9730a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 9740a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[2], res0, res1; 9750a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 9760a39d0a697ff3603e8c100300fda363658e10b23James Zern 9770a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 9780a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 9790a39d0a697ff3603e8c100300fda363658e10b23James Zern 9800a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 9810a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 9820a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_2t_pixels(src_ptr, src_pitch, signal); 9830a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16_2t_pixels(signal, &ff, &res0, &res1); 9840a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 9850a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 2; 9860a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch << 1; 9870a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch << 1; 9880a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 1); 9890a39d0a697ff3603e8c100300fda363658e10b23James Zern 9900a39d0a697ff3603e8c100300fda363658e10b23James Zern if (height > 0) { 9910a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_2t_pixels(src_ptr, signal); 9920a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8x1_2t_pixels(signal, &ff, &res0); 9930a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x1_avg_pixels(&res0, &max, dst_ptr); 9940a39d0a697ff3603e8c100300fda363658e10b23James Zern } 9950a39d0a697ff3603e8c100300fda363658e10b23James Zern} 9960a39d0a697ff3603e8c100300fda363658e10b23James Zern 9970a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void 
vpx_highbd_filter_block1d16_h2_avg_avx2( 9980a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 9990a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 10000a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i signal[2], res0, res1; 10010a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 10020a39d0a697ff3603e8c100300fda363658e10b23James Zern 10030a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 10040a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 10050a39d0a697ff3603e8c100300fda363658e10b23James Zern 10060a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr -= 3; 10070a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 10080a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x1_2t_pixels(src_ptr, signal); 10090a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16_2t_pixels(signal, &ff, &res0, &res1); 10100a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); 10110a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 10120a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 10130a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 10140a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 10150a39d0a697ff3603e8c100300fda363658e10b23James Zern} 10160a39d0a697ff3603e8c100300fda363658e10b23James Zern 10170a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d16_v2_avg_avx2( 10180a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 10190a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 10200a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i 
signal[3], res0, res1; 10210a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 10220a39d0a697ff3603e8c100300fda363658e10b23James Zern __m256i ff; 10230a39d0a697ff3603e8c100300fda363658e10b23James Zern 10240a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_2t_filter(filter, &ff); 10250a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x2_init(src_ptr, signal); 10260a39d0a697ff3603e8c100300fda363658e10b23James Zern 10270a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 10280a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_16x2_2t_pixels(src_ptr, src_pitch, signal); 10290a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_16x2_2t_pixels(signal, &ff, &res0, &res1); 10300a39d0a697ff3603e8c100300fda363658e10b23James Zern store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); 10310a39d0a697ff3603e8c100300fda363658e10b23James Zern 10320a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 10330a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 10340a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 10350a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 10360a39d0a697ff3603e8c100300fda363658e10b23James Zern} 10370a39d0a697ff3603e8c100300fda363658e10b23James Zern 10380a39d0a697ff3603e8c100300fda363658e10b23James Zernstatic void vpx_highbd_filter_block1d8_v2_avg_avx2( 10390a39d0a697ff3603e8c100300fda363658e10b23James Zern const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 10400a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 10410a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i signal[3], res0, res1; 10420a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i max = _mm_set1_epi16((1 << bd) - 1); 10430a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i ff; 10440a39d0a697ff3603e8c100300fda363658e10b23James Zern 
10450a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x1_2t_filter(filter, &ff); 10460a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_init(src_ptr, signal); 10470a39d0a697ff3603e8c100300fda363658e10b23James Zern 10480a39d0a697ff3603e8c100300fda363658e10b23James Zern do { 10490a39d0a697ff3603e8c100300fda363658e10b23James Zern pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); 10500a39d0a697ff3603e8c100300fda363658e10b23James Zern filter_8_2t_pixels(signal, &ff, &res0, &res1); 10510a39d0a697ff3603e8c100300fda363658e10b23James Zern store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); 10520a39d0a697ff3603e8c100300fda363658e10b23James Zern 10530a39d0a697ff3603e8c100300fda363658e10b23James Zern src_ptr += src_pitch; 10540a39d0a697ff3603e8c100300fda363658e10b23James Zern dst_ptr += dst_pitch; 10550a39d0a697ff3603e8c100300fda363658e10b23James Zern height -= 1; 10560a39d0a697ff3603e8c100300fda363658e10b23James Zern } while (height > 0); 10570a39d0a697ff3603e8c100300fda363658e10b23James Zern} 10580a39d0a697ff3603e8c100300fda363658e10b23James Zern 10590a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, 10600a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t, uint32_t, const int16_t *, 10610a39d0a697ff3603e8c100300fda363658e10b23James Zern int); 10620a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, 10630a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t, uint32_t, const int16_t *, 10640a39d0a697ff3603e8c100300fda363658e10b23James Zern int); 10650a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, 10660a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t, uint32_t, const int16_t *, 10670a39d0a697ff3603e8c100300fda363658e10b23James Zern int); 10680a39d0a697ff3603e8c100300fda363658e10b23James 
Zernvoid vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, 10690a39d0a697ff3603e8c100300fda363658e10b23James Zern ptrdiff_t, uint32_t, const int16_t *, 10700a39d0a697ff3603e8c100300fda363658e10b23James Zern int); 10710a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 10720a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 10730a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 10740a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 10750a39d0a697ff3603e8c100300fda363658e10b23James Zern 10760a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); 10770a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); 10780a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_2D(, avx2); 10790a39d0a697ff3603e8c100300fda363658e10b23James Zern 10800a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, 10810a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *, ptrdiff_t, uint32_t, 10820a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *, int); 10830a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, 10840a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *, ptrdiff_t, uint32_t, 10850a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *, int); 10860a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, 10870a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *, ptrdiff_t, 
uint32_t, 10880a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *, int); 10890a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, 10900a39d0a697ff3603e8c100300fda363658e10b23James Zern uint16_t *, ptrdiff_t, uint32_t, 10910a39d0a697ff3603e8c100300fda363658e10b23James Zern const int16_t *, int); 10920a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h8_avg_avx2 \ 10930a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_filter_block1d4_h8_avg_sse2 10940a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_h2_avg_avx2 \ 10950a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_filter_block1d4_h2_avg_sse2 10960a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v8_avg_avx2 \ 10970a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_filter_block1d4_v8_avg_sse2 10980a39d0a697ff3603e8c100300fda363658e10b23James Zern#define vpx_highbd_filter_block1d4_v2_avg_avx2 \ 10990a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_filter_block1d4_v2_avg_sse2 11000a39d0a697ff3603e8c100300fda363658e10b23James Zern 11010a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); 11020a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, 11030a39d0a697ff3603e8c100300fda363658e10b23James Zern avx2); 11040a39d0a697ff3603e8c100300fda363658e10b23James ZernHIGH_FUN_CONV_2D(avg_, avx2); 11050a39d0a697ff3603e8c100300fda363658e10b23James Zern 11060a39d0a697ff3603e8c100300fda363658e10b23James Zern#undef HIGHBD_FUNC 1107