19682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall/* 29682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 39682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * 49682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * Use of this source code is governed by a BSD-style license 59682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * that can be found in the LICENSE file in the root of the source 69682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * tree. An additional intellectual property rights grant can be found 79682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * in the file PATENTS. All contributing project authors may 89682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall * be found in the AUTHORS file in the root of the source tree. 99682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall */ 109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "./vpx_config.h" 119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "vp9/encoder/vp9_variance.h" 139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "vp9/common/vp9_pragmas.h" 149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall#include "vpx_ports/mem.h" 159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Halltypedef void (*get_var_avx2) ( 179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *src_ptr, 189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, 209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *SSE, 229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int *Sum 239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall); 249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallvoid 
vp9_get16x16var_avx2 269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall( 279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *src_ptr, 289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, 309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *SSE, 329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int *Sum 339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall); 349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallvoid vp9_get32x32var_avx2 369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall( 379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *src_ptr, 389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, 409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *SSE, 429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int *Sum 439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall); 449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_sub_pixel_variance32xh_avx2 469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall( 479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *src, 489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int src_stride, 499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int x_offset, 509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int y_offset, 519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *dst, 529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int dst_stride, 539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int height, 549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 
unsigned int *sse 559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall); 569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_sub_pixel_avg_variance32xh_avx2 589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall( 599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *src, 609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int src_stride, 619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int x_offset, 629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int y_offset, 639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *dst, 649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int dst_stride, 659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *sec, 669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int sec_stride, 679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int height, 689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sseptr 699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall); 709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallstatic void variance_avx2(const unsigned char *src_ptr, int source_stride, 729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, int recon_stride, 739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int w, int h, unsigned int *sse, int *sum, 749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall get_var_avx2 var_fn, int block_size) { 759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int sse0; 769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int sum0; 779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int i, j; 789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = 0; 809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sum = 0; 819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall for (i = 0; 
i < h; i += 16) { 839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall for (j = 0; j < w; j += block_size) { 849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall // processing 16 rows horizontally each call 859682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall var_fn(src_ptr + source_stride * i + j, source_stride, 869682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); 879682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse += sse0; 889682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sum += sum0; 899682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall } 909682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall } 919682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 929682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance16x16_avx2 949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall( 959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *src_ptr, 969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, 989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int var; 1019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int avg; 1029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, 1049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &var, &avg, vp9_get16x16var_avx2, 16); 1059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = var; 1069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return (var - (((unsigned int)avg * avg) >> 8)); 1079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 
1099682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_mse16x16_avx2( 1109682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *src_ptr, 1119682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 1129682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const unsigned char *ref_ptr, 1139682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 1149682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1159682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int sse0; 1169682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int sum0; 1179682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, 1189682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &sum0); 1199682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = sse0; 1209682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return sse0; 1219682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1229682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, 1249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 1259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *ref_ptr, 1269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 1279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int var; 1299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int avg; 1309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall // processing 32 elements vertically in parallel 1329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, 1339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &var, &avg, vp9_get32x32var_avx2, 32); 
1349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = var; 1359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return (var - (((int64_t)avg * avg) >> 10)); 1369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, 1399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 1409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *ref_ptr, 1419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 1429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int var; 1449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int avg; 1459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall // processing 32 elements vertically in parallel 1479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, 1489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &var, &avg, vp9_get32x32var_avx2, 32); 1499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = var; 1509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return (var - (((int64_t)avg * avg) >> 9)); 1519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, 1559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 1569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *ref_ptr, 1579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 1589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int 
var; 1609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int avg; 1619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall // processing 32 elements vertically in parallel 1639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, 1649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &var, &avg, vp9_get32x32var_avx2, 32); 1659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = var; 1669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return (var - (((int64_t)avg * avg) >> 12)); 1679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, 1709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int source_stride, 1719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall const uint8_t *ref_ptr, 1729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int recon_stride, 1739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int *sse) { 1749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall unsigned int var; 1759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall int avg; 1769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall // processing 32 elements vertically in parallel 1789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, 1799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall &var, &avg, vp9_get32x32var_avx2, 32); 1809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 1819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall *sse = var; 1829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall return (var - (((int64_t)avg * avg) >> 11)); 1839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall} 1849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall 
// 64x64 sub-pixel variant. The block is handled as two 32-column halves, each
// passed to the 32xh kernel with height 64. The kernel's return values are
// summed into se and its SSE outputs into sse; the total SSE is stored in
// *sse_ptr and the function returns sse - ((se * se) >> 12), where
// 12 == log2(64 * 64).
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // processing 32 elements in parallel
  // The kernel is declared as returning unsigned int; its result is kept in
  // a signed int here.
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // processing the next 32 elements in parallel
  unsigned int sse2;
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  // Square in 64 bits before shifting.
  return sse - (((int64_t)se * se) >> 12);
}

// 32x32 sub-pixel variant. A single call to the 32xh kernel with height 32
// covers the block; its SSE output is stored in *sse_ptr and the function
// returns sse - ((se * se) >> 10), where 10 == log2(32 * 32).
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // processing 32 elements in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  // Square in 64 bits before shifting.
  return sse - (((int64_t)se * se) >> 10);
}

// 64x64 sub-pixel variant that takes an additional buffer, sec. The block is
// handled as two 32-column halves; for each, the avg 32xh kernel is called
// with sec_stride 64 and height 64, and src, dst and sec are all advanced by
// 32 for the second half. The total SSE is stored in *sseptr and the function
// returns sse - ((se * se) >> 12), where 12 == log2(64 * 64).
// NOTE(review): sec is presumably a second predictor stored contiguously
// with a stride equal to the block width -- confirm against the kernel.
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // processing 32 elements in parallel
  unsigned int sse;

  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  unsigned int sse2;
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;

  // Square in 64 bits before shifting.
  return sse - (((int64_t)se * se) >> 12);
}

// 32x32 sub-pixel variant that takes an additional buffer, sec. A single call
// to the avg 32xh kernel with sec_stride 32 and height 32 covers the block;
// its SSE output is stored in *sseptr and the function returns
// sse - ((se * se) >> 10), where 10 == log2(32 * 32).
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // processing 32 elements in parallel
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  // Square in 64 bits before shifting.
  return sse - (((int64_t)se * se) >> 10);
}
