/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

// Signature of a block kernel that variance_avx2() can drive.
//
// The kernel receives a source pointer and a reference pointer together with
// their strides, and reports its two results through the SSE and Sum output
// pointers (variance_avx2() reads both after every call).
typedef void (*get_var_avx2)(const unsigned char *src_ptr,
                             int source_stride,
                             const unsigned char *ref_ptr,
                             int recon_stride,
                             unsigned int *SSE,
                             int *Sum);

// Forward declaration of the kernel used by vp9_variance16x16_avx2() and
// vp9_mse16x16_avx2().  Its results are returned through SSE and Sum.
void vp9_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum);

// Forward declaration of the kernel used by the 32- and 64-wide variance
// functions in this file.  variance_avx2() steps 32 columns and 16 rows
// between successive calls to it; results are returned through SSE and Sum.
void vp9_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum);

// Forward declaration of the 32-column sub-pixel helper.
//
// Callers in this file pass the number of rows as height, read the
// squared-error total from *sse, and store the return value in an int that
// they then use as the sum term of the variance formula.
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

// Forward declaration of the 32-column sub-pixel helper that additionally
// takes a second buffer (sec) with its own stride.
//
// Callers in this file pass the number of rows as height, read the
// squared-error total from *sseptr, and store the return value in an int that
// they then use as the sum term of the variance formula.
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianstatic void variance_avx2(const unsigned char *src_ptr, int  source_stride,
712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                        const unsigned char *ref_ptr, int  recon_stride,
722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                        int  w, int  h, unsigned int *sse, int *sum,
732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                        get_var_avx2 var_fn, int block_size) {
742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int sse0;
752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int sum0;
762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int i, j;
772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = 0;
792ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sum = 0;
802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  for (i = 0; i < h; i += 16) {
822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    for (j = 0; j < w; j += block_size) {
832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      // processing 16 rows horizontally each call
842ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      var_fn(src_ptr + source_stride * i + j, source_stride,
852ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
862ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      *sse += sse0;
872ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian      *sum += sum0;
882ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian    }
892ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  }
902ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
912ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
922ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance16x16_avx2
932ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian(
942ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  const unsigned char *src_ptr,
952ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int  source_stride,
962ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  const unsigned char *ref_ptr,
972ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int  recon_stride,
982ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int *sse) {
992ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int var;
1002ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int avg;
1012ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1022ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
1032ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                &var, &avg, vp9_get16x16var_avx2, 16);
1042ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = var;
1052ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return (var - (((unsigned int)avg * avg) >> 8));
1062ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
1072ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
// Squared-error total of a 16x16 region.
//
// Makes a single call to vp9_get16x16var_avx2(), stores the squared-error
// value it reports in *sse, and returns the same value.  The sum output is
// required by the kernel's signature but is not used here.
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int unused_sum;

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &unused_sum);
  *sse = block_sse;
  return block_sse;
}

1222ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
1232ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  source_stride,
1242ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    const uint8_t *ref_ptr,
1252ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  recon_stride,
1262ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    unsigned int *sse) {
1272ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int var;
1282ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int avg;
1292ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1302ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // processing 32 elements vertically in parallel
1312ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
1322ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                &var, &avg, vp9_get32x32var_avx2, 32);
1332ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = var;
1342ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return (var - (((int64_t)avg * avg) >> 10));
1352ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
1362ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1372ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
1382ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  source_stride,
1392ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    const uint8_t *ref_ptr,
1402ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  recon_stride,
1412ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    unsigned int *sse) {
1422ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int var;
1432ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int avg;
1442ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1452ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // processing 32 elements vertically in parallel
1462ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
1472ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                &var, &avg, vp9_get32x32var_avx2, 32);
1482ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = var;
1492ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return (var - (((int64_t)avg * avg) >> 9));
1502ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
1512ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1522ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1532ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
1542ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  source_stride,
1552ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    const uint8_t *ref_ptr,
1562ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  recon_stride,
1572ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    unsigned int *sse) {
1582ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int var;
1592ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int avg;
1602ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1612ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // processing 32 elements vertically in parallel
1622ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
1632ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                &var, &avg, vp9_get32x32var_avx2, 32);
1642ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = var;
1652ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return (var - (((int64_t)avg * avg) >> 12));
1662ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
1672ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1682ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanianunsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
1692ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  source_stride,
1702ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    const uint8_t *ref_ptr,
1712ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    int  recon_stride,
1722ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                                    unsigned int *sse) {
1732ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  unsigned int var;
1742ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  int avg;
1752ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1762ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  // processing 32 elements vertically in parallel
1772ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
1782ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian                &var, &avg, vp9_get32x32var_avx2, 32);
1792ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
1802ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  *sse = var;
1812ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian  return (var - (((int64_t)avg * avg) >> 11));
1822ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian}
1832ec72e65689c948e92b826ae1e867bf369e72f13Vignesh Venkatasubramanian
// Sub-pixel variance of a 64x64 region.
//
// The region is handled as two 32-column halves, each 64 rows high, by
// calling vp9_sub_pixel_variance32xh_avx2() at column offsets 0 and 32 of
// both src and dst.  x_offset and y_offset are forwarded unchanged.  The two
// squared-error totals are added and stored in *sse_ptr; the return value is
// that total minus (sum * sum) / 4096, where sum adds the two helper results.
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // Left 32 columns.
  unsigned int sse_left;
  int sum_left = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                                 y_offset, dst, dst_stride,
                                                 64, &sse_left);
  // Right 32 columns.
  unsigned int sse_right;
  int sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                                  x_offset, y_offset,
                                                  dst + 32, dst_stride,
                                                  64, &sse_right);
  const int sum_total = sum_left + sum_right;
  const unsigned int sse_total = sse_left + sse_right;

  *sse_ptr = sse_total;
  // 64 * 64 = 4096 samples, so the division is a shift by 12.
  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
}

// Sub-pixel variance of a 32x32 region.
//
// Makes one call to vp9_sub_pixel_variance32xh_avx2() with a height of 32,
// forwarding x_offset and y_offset unchanged.  The squared-error total it
// reports is stored in *sse_ptr; the return value is that total minus
// (sum * sum) / 1024, where sum is the helper's return value.
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_total;
  int sum_total = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                                  y_offset, dst, dst_stride,
                                                  32, &sse_total);

  *sse_ptr = sse_total;
  // 32 * 32 = 1024 samples, so the division is a shift by 10.
  return sse_total - (((int64_t)sum_total * sum_total) >> 10);
}

// Sub-pixel variance of a 64x64 region, with a second buffer (sec) passed
// through to the helper.
//
// The region is handled as two 32-column halves, each 64 rows high, by
// calling vp9_sub_pixel_avg_variance32xh_avx2() at column offsets 0 and 32 of
// src, dst and sec.  sec is passed with a stride of 64.  x_offset and
// y_offset are forwarded unchanged.  The two squared-error totals are added
// and stored in *sseptr; the return value is that total minus
// (sum * sum) / 4096, where sum adds the two helper results.
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  // Left 32 columns.
  unsigned int sse_left;
  int sum_left = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                     x_offset, y_offset,
                                                     dst, dst_stride,
                                                     sec, 64, 64, &sse_left);
  // Right 32 columns.
  unsigned int sse_right;
  int sum_right = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride,
                                                      x_offset, y_offset,
                                                      dst + 32, dst_stride,
                                                      sec + 32, 64, 64,
                                                      &sse_right);
  const int sum_total = sum_left + sum_right;
  const unsigned int sse_total = sse_left + sse_right;

  *sseptr = sse_total;

  // 64 * 64 = 4096 samples, so the division is a shift by 12.
  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
}

// Sub-pixel variance of a 32x32 region, with a second buffer (sec) passed
// through to the helper.
//
// Makes one call to vp9_sub_pixel_avg_variance32xh_avx2() with a height of 32
// and a sec stride of 32, forwarding x_offset and y_offset unchanged.  The
// squared-error total it reports is stored in *sseptr; the return value is
// that total minus (sum * sum) / 1024, where sum is the helper's return value.
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse_total;
  int sum_total = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                      x_offset, y_offset,
                                                      dst, dst_stride,
                                                      sec, 32, 32, &sse_total);

  *sseptr = sse_total;
  // 32 * 32 = 1024 samples, so the division is a shift by 10.
  return sse_total - (((int64_t)sum_total * sum_total) >> 10);
}