/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <stdint.h>

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

// Signature shared by the tile kernels that variance_avx2() drives.  Each
// call receives a source tile and a reference tile (pointer + row stride) and
// reports its results through SSE and Sum.
typedef void (*get_var_avx2)(const unsigned char *src_ptr,
                             int source_stride,
                             const unsigned char *ref_ptr,
                             int recon_stride,
                             unsigned int *SSE,
                             int *Sum);

// The four kernels below are only declared here; their definitions live
// outside this file.
// NOTE(review): the names suggest 16x16 / 32-wide tile kernels, but
// variance_avx2() advances 16 rows per call for both -- confirm the row count
// each kernel really consumes against its implementation.
void vp9_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum);

// 32-pixel-wide sub-pixel kernel operating on `height` rows.  The callers in
// this file treat the return value as a signed sum and read the squared error
// back through `sse`.
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);

// Same as above with an additional second predictor (`sec`, `sec_stride`).
// NOTE(review): how `sec` is combined with the prediction is not visible
// here; presumably it is averaged in -- confirm in the kernel.
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);

// Walks a w x h block in tiles and accumulates what var_fn reports per tile.
//
//   src_ptr / source_stride : top-left of the source block and its row stride
//   ref_ptr / recon_stride  : top-left of the reference block and its stride
//   w, h                    : block dimensions in pixels
//   sse, sum                : outputs; reset to 0, then the per-tile values
//                             written by var_fn are added in
//   var_fn                  : kernel invoked once per tile
//   block_size              : horizontal step between tiles, in pixels
//
// The vertical step is fixed at 16 rows, so var_fn is expected to cover a
// block_size x 16 area per call.
static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int row, col;

  *sse = 0;
  *sum = 0;

  for (row = 0; row < h; row += 16) {
    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      // Hand the kernel the tile whose top-left corner is (row, col).
      var_fn(src_ptr + source_stride * row + col, source_stride,
             ref_ptr + recon_stride * row + col, recon_stride,
             &tile_sse, &tile_sum);
      *sse += tile_sse;
      *sum += tile_sum;
    }
  }
}

939682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance16x16_avx2
949682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall(
959682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  const unsigned char *src_ptr,
969682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int  source_stride,
979682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  const unsigned char *ref_ptr,
989682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int  recon_stride,
999682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int *sse) {
1009682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int var;
1019682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int avg;
1029682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1039682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
1049682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                &var, &avg, vp9_get16x16var_avx2, 16);
1059682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  *sse = var;
1069682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  return (var - (((unsigned int)avg * avg) >> 8));
1079682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1089682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
// Squared error of a 16x16 block.
//
// Runs the 16x16 kernel once, stores the squared-error value it reports in
// *sse and returns the same value.  The sum the kernel also reports is not
// used.
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int unused_sum;

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &unused_sum);
  *sse = block_sse;
  return block_sse;
}

1239682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
1249682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  source_stride,
1259682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    const uint8_t *ref_ptr,
1269682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  recon_stride,
1279682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    unsigned int *sse) {
1289682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int var;
1299682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int avg;
1309682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1319682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  // processing 32 elements vertically in parallel
1329682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
1339682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                &var, &avg, vp9_get32x32var_avx2, 32);
1349682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  *sse = var;
1359682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  return (var - (((int64_t)avg * avg) >> 10));
1369682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1379682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1389682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
1399682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  source_stride,
1409682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    const uint8_t *ref_ptr,
1419682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  recon_stride,
1429682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    unsigned int *sse) {
1439682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int var;
1449682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int avg;
1459682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1469682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  // processing 32 elements vertically in parallel
1479682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
1489682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                &var, &avg, vp9_get32x32var_avx2, 32);
1499682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  *sse = var;
1509682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  return (var - (((int64_t)avg * avg) >> 9));
1519682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1529682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1539682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1549682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
1559682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  source_stride,
1569682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    const uint8_t *ref_ptr,
1579682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  recon_stride,
1589682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    unsigned int *sse) {
1599682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int var;
1609682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int avg;
1619682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1629682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  // processing 32 elements vertically in parallel
1639682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
1649682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                &var, &avg, vp9_get32x32var_avx2, 32);
1659682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  *sse = var;
1669682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  return (var - (((int64_t)avg * avg) >> 12));
1679682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1689682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1699682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hallunsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
1709682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  source_stride,
1719682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    const uint8_t *ref_ptr,
1729682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    int  recon_stride,
1739682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                                    unsigned int *sse) {
1749682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  unsigned int var;
1759682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  int avg;
1769682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1779682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  // processing 32 elements vertically in parallel
1789682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
1799682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall                &var, &avg, vp9_get32x32var_avx2, 32);
1809682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
1819682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  *sse = var;
1829682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall  return (var - (((int64_t)avg * avg) >> 11));
1839682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall}
1849682c8870b8ff5e4ac2e4c70b759f791c6f38c1fJesse Hall
// Sub-pixel variance of a 64x64 block.
//
// The 32-pixel-wide kernel is run on the left and the right half (64 rows
// each) and the two results are added.  Stores the combined squared error in
// *sse_ptr and returns sse - sum^2 / 4096 (shift by 12 = 64 * 64 pixels).
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_left;
  unsigned int sse_right;
  // Columns 0..31.  The kernel's unsigned return value is kept in a signed
  // int and treated as a signed sum from here on.
  int sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                            x_offset, y_offset,
                                            dst, dst_stride,
                                            64, &sse_left);
  // Columns 32..63.
  int sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                                  x_offset, y_offset,
                                                  dst + 32, dst_stride,
                                                  64, &sse_right);
  unsigned int total_sse = sse_left + sse_right;

  sum += sum_right;
  *sse_ptr = total_sse;
  return total_sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

// Sub-pixel variance of a 32x32 block.
//
// A single call to the 32-pixel-wide kernel covers the whole block (32 rows).
// Stores the squared error in *sse_ptr and returns sse - sum^2 / 1024
// (shift by 10 = 32 * 32 pixels).
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int block_sse;
  // The kernel's unsigned return value is kept in a signed int and treated as
  // a signed sum.
  int sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                            x_offset, y_offset,
                                            dst, dst_stride,
                                            32, &block_sse);

  *sse_ptr = block_sse;
  return block_sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

// Sub-pixel variance of a 64x64 block with a second predictor `sec`.
//
// The 32-pixel-wide kernel is run on the left and the right half (64 rows
// each); `sec` is passed with a row stride of 64.  Stores the combined squared
// error in *sseptr and returns sse - sum^2 / 4096 (shift by 12 = 64 * 64).
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse_left;
  unsigned int sse_right;
  // Columns 0..31: sec_stride = 64, height = 64.  The kernel's unsigned
  // return value is kept in a signed int and treated as a signed sum.
  int sum = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                x_offset, y_offset,
                                                dst, dst_stride,
                                                sec, 64, 64, &sse_left);
  // Columns 32..63: same stride and height, every pointer advanced by 32.
  int sum_right = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride,
                                                      x_offset, y_offset,
                                                      dst + 32, dst_stride,
                                                      sec + 32, 64, 64,
                                                      &sse_right);
  unsigned int total_sse = sse_left + sse_right;

  sum += sum_right;
  *sseptr = total_sse;
  return total_sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

// Sub-pixel variance of a 32x32 block with a second predictor `sec`.
//
// A single call to the 32-pixel-wide kernel covers the whole block; `sec` is
// passed with a row stride of 32.  Stores the squared error in *sseptr and
// returns sse - sum^2 / 1024 (shift by 10 = 32 * 32 pixels).
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int block_sse;
  // sec_stride = 32, height = 32.  The kernel's unsigned return value is kept
  // in a signed int and treated as a signed sum.
  int sum = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                x_offset, y_offset,
                                                dst, dst_stride,
                                                sec, 32, 32, &block_sse);

  *sseptr = block_sse;
  return block_sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}