12ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian/* 22ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 32ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * 42ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 52ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 62ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 72ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 82ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 92ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian */ 102ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian#include "./vpx_config.h" 112ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian#include "vp9/encoder/vp9_variance.h" 132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian#include "vpx_ports/mem.h" 142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramaniantypedef void (*get_var_avx2) ( 162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *src_ptr, 172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, 192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *SSE, 212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int *Sum 222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian); 232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianvoid vp9_get16x16var_avx2 252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian( 262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *src_ptr, 272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, 292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 302ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *SSE, 312ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int *Sum 322ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian); 332ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianvoid vp9_get32x32var_avx2 352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian( 362ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *src_ptr, 372ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 382ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, 392ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 402ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *SSE, 412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int *Sum 422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian); 432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_variance32xh_avx2 452ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian( 462ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *src, 472ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int height, 532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse 542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian); 552ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_avg_variance32xh_avx2 572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian( 582ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *src, 592ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 602ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 612ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 622ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 632ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *sec, 652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int sec_stride, 662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int height, 672ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sseptr 682ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian); 692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianstatic void variance_avx2(const unsigned char *src_ptr, int source_stride, 712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, int recon_stride, 722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int w, int h, unsigned int *sse, int *sum, 732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian get_var_avx2 var_fn, int block_size) { 742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse0; 752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int sum0; 762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int i, j; 772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = 0; 792ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sum = 0; 802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian for (i = 0; i < h; i += 16) { 822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian for (j = 0; j < w; j += block_size) { 832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 16 rows horizontally each call 842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian var_fn(src_ptr + source_stride * i + j, source_stride, 852ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0); 862ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse += sse0; 872ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sum += sum0; 882ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian } 892ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian } 902ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 912ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 922ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance16x16_avx2 932ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian( 942ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *src_ptr, 952ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 962ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, 972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 992ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int var; 1002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int avg; 1012ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1022ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, 1032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &var, &avg, vp9_get16x16var_avx2, 16); 1042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = var; 1052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return (var - (((unsigned int)avg * avg) >> 8)); 1062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1072ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1082ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_mse16x16_avx2( 1092ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *src_ptr, 1102ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 1112ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const unsigned char *ref_ptr, 1122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 1132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 1142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse0; 1152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int sum0; 1162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, 1172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &sum0); 1182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = sse0; 1192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return sse0; 1202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, 1232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 1242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *ref_ptr, 1252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 1262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 1272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int var; 1282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int avg; 1292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1302ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements vertically in parallel 1312ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, 1322ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &var, &avg, vp9_get32x32var_avx2, 32); 1332ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = var; 1342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return (var - (((int64_t)avg * avg) >> 10)); 1352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1362ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1372ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, 1382ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 1392ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *ref_ptr, 1402ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 1412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 1422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int var; 1432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int avg; 1442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1452ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements vertically in parallel 1462ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, 1472ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &var, &avg, vp9_get32x32var_avx2, 32); 1482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = var; 1492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return (var - (((int64_t)avg * avg) >> 9)); 1502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, 1542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 1552ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *ref_ptr, 1562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 1572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 1582ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int var; 1592ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int avg; 1602ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1612ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements vertically in parallel 1622ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, 1632ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &var, &avg, vp9_get32x32var_avx2, 32); 1642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = var; 1652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return (var - (((int64_t)avg * avg) >> 12)); 1662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1672ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1682ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, 1692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int source_stride, 1702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *ref_ptr, 1712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int recon_stride, 1722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse) { 1732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int var; 1742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int avg; 1752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements vertically in parallel 1772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, 1782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian &var, &avg, vp9_get32x32var_avx2, 32); 1792ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse = var; 1812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return (var - (((int64_t)avg * avg) >> 11)); 1822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 1832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 1842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, 1852ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 1862ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 1872ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 1882ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 1892ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 1902ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse_ptr) { 1912ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements in parallel 1922ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse; 1932ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, 1942ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian y_offset, dst, dst_stride, 1952ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 64, &sse); 1962ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing the next 32 elements in parallel 1972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse2; 1982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, 1992ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian x_offset, y_offset, 2002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian dst + 32, dst_stride, 2012ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 64, &sse2); 2022ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian se += se2; 2032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian sse += sse2; 2042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse_ptr = sse; 2052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return sse - (((int64_t)se * se) >> 12); 2062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 2072ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2082ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, 2092ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 2102ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 2112ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 2122ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 2132ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 2142ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sse_ptr) { 2152ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 element in parallel 2162ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse; 2172ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, 2182ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian y_offset, dst, dst_stride, 2192ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 32, &sse); 2202ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sse_ptr = sse; 2212ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return sse - (((int64_t)se * se) >> 10); 2222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 2232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, 2252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 2262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 2272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 2282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 2292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 2302ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sseptr, 2312ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *sec) { 2322ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 elements in parallel 2332ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse; 2342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, 2362ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian y_offset, dst, dst_stride, 2372ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian sec, 64, 64, &sse); 2382ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse2; 2392ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing the next 32 elements in parallel 2402ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, 2412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian y_offset, dst + 32, dst_stride, 2422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian sec + 32, 64, 64, &sse2); 2432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian se += se2; 2442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian sse += sse2; 2452ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sseptr = sse; 2462ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2472ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return sse - (((int64_t)se * se) >> 12); 2482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 2492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, 2512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int src_stride, 2522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int x_offset, 2532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int y_offset, 2542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *dst, 2552ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int dst_stride, 2562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int *sseptr, 2572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian const uint8_t *sec) { 2582ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian // processing 32 element in parallel 2592ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian unsigned int sse; 2602ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, 2612ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian y_offset, dst, dst_stride, 2622ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian sec, 32, 32, &sse); 2632ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian *sseptr = sse; 2642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian return sse - (((int64_t)se * se) >> 10); 2652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian} 2662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 2672ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian 268