/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vpx_config.h"
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/encoder/vp9_variance.h"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_pragmas.h"
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Signature of the block kernels that variance_avx2() dispatches to.
 * A kernel receives a source block and a reference block, each with its own
 * stride, and reports its two results through the SSE and Sum out-pointers
 * (variance_avx2() reads both after every call). */
typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *SSE, int *Sum);
24233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Kernel used for 16x16 blocks; it matches the get_var_avx2 signature.
 * Only the declaration is visible here -- the definition is not part of this
 * portion of the file.  Callers in this file read *SSE and *Sum after the
 * call and treat them as the squared-error total and the difference total. */
void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);
34233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Kernel used for blocks that are 32 or 64 pixels wide; it matches the
 * get_var_avx2 signature.  Only the declaration is visible here.
 * NOTE(review): every caller in this file invokes it once per 32-column,
 * 16-row step (vp9_variance32x16_avx2 calls it exactly once), so it appears
 * to cover 16 rows per call despite the "32x32" in its name -- confirm
 * against the implementation. */
void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum);
44233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Sub-pixel kernel for a column 32 pixels wide and `height` rows tall.
 * Only the declaration is visible here.  Callers in this file store the
 * return value in a signed int and use it as the difference total, and read
 * the squared-error total back through `sse`.
 * NOTE(review): x_offset / y_offset are presumably the sub-pixel filter
 * positions -- confirm against the implementation. */
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);
56233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Sub-pixel kernel for a column 32 pixels wide and `height` rows tall that
 * additionally takes a second buffer `sec` with its own stride.
 * Only the declaration is visible here.  Callers in this file store the
 * return value in a signed int and use it as the difference total, and read
 * the squared-error total back through `sseptr`.
 * NOTE(review): `sec` is presumably a second prediction that is averaged
 * with the filtered source before comparison -- confirm against the
 * implementation. */
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);
70233d2500723e5594f3e7c70896ffeeef32b9c950ywan
71233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void variance_avx2(const unsigned char *src_ptr, int  source_stride,
72233d2500723e5594f3e7c70896ffeeef32b9c950ywan                        const unsigned char *ref_ptr, int  recon_stride,
73233d2500723e5594f3e7c70896ffeeef32b9c950ywan                        int  w, int  h, unsigned int *sse, int *sum,
74233d2500723e5594f3e7c70896ffeeef32b9c950ywan                        get_var_avx2 var_fn, int block_size) {
75233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int sse0;
76233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int sum0;
77233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i, j;
78233d2500723e5594f3e7c70896ffeeef32b9c950ywan
79233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = 0;
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sum = 0;
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan
82233d2500723e5594f3e7c70896ffeeef32b9c950ywan  for (i = 0; i < h; i += 16) {
83233d2500723e5594f3e7c70896ffeeef32b9c950ywan    for (j = 0; j < w; j += block_size) {
84233d2500723e5594f3e7c70896ffeeef32b9c950ywan      // processing 16 rows horizontally each call
85233d2500723e5594f3e7c70896ffeeef32b9c950ywan      var_fn(src_ptr + source_stride * i + j, source_stride,
86233d2500723e5594f3e7c70896ffeeef32b9c950ywan             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
87233d2500723e5594f3e7c70896ffeeef32b9c950ywan      *sse += sse0;
88233d2500723e5594f3e7c70896ffeeef32b9c950ywan      *sum += sum0;
89233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
90233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
91233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
92233d2500723e5594f3e7c70896ffeeef32b9c950ywan
93233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance16x16_avx2
94233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
95233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
96233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
98233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
99233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
100233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
101233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
102233d2500723e5594f3e7c70896ffeeef32b9c950ywan
103233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
104233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_avx2, 16);
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
106233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 8));
107233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
108233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Squared-error measure of a 16x16 block.
//
// Calls the 16x16 kernel once, stores the SSE value it reports in *sse and
// returns the same value.  No division by the pixel count is applied here.
unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sq_err;
  int ignored_sum;  // required by the kernel's signature, not used here

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &ignored_sum);
  return (*sse = sq_err);
}
122233d2500723e5594f3e7c70896ffeeef32b9c950ywan
123233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
124233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
125233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
126233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
127233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
128233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
129233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
130233d2500723e5594f3e7c70896ffeeef32b9c950ywan
131233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // processing 32 elements vertically in parallel
132233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
133233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get32x32var_avx2, 32);
134233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 10));
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan
138233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // processing 32 elements vertically in parallel
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get32x32var_avx2, 32);
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 9));
151233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan
154233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // processing 32 elements vertically in parallel
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get32x32var_avx2, 32);
165233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 12));
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan
169233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // processing 32 elements vertically in parallel
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get32x32var_avx2, 32);
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 11));
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Sub-pixel variance of a 64x64 block.
//
// The block is handled as two 32-column halves, each 64 rows tall, by the
// 32xh kernel.  The combined squared-error total is stored in *sse_ptr and
// the function returns that total minus (sum * sum) / 4096.
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse_left, sse_right, sse_total;
  int sum_left, sum_right, sum_total;

  // Columns 0..31.
  sum_left = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                             x_offset, y_offset,
                                             dst, dst_stride,
                                             64, &sse_left);
  // Columns 32..63.
  sum_right = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                              x_offset, y_offset,
                                              dst + 32, dst_stride,
                                              64, &sse_right);

  sum_total = sum_left + sum_right;
  sse_total = sse_left + sse_right;
  *sse_ptr = sse_total;

  // The square is formed in 64 bits; 64 * 64 = 4096 pixels, hence the
  // shift by 12.
  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
}
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Sub-pixel variance of a 32x32 block.
//
// A single call of the 32xh kernel with height 32 covers the block.  The
// squared-error total it reports is stored in *sse_ptr and the function
// returns that total minus (sum * sum) / 1024.
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sq_err;
  int diff_sum;

  diff_sum = vp9_sub_pixel_variance32xh_avx2(src, src_stride,
                                             x_offset, y_offset,
                                             dst, dst_stride,
                                             32, &sq_err);
  *sse_ptr = sq_err;

  // The square is formed in 64 bits; 32 * 32 = 1024 pixels, hence the
  // shift by 10.
  return sq_err - (((int64_t)diff_sum * diff_sum) >> 10);
}
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Sub-pixel variance of a 64x64 block with a second buffer `sec`.
//
// The block is handled as two 32-column halves, each 64 rows tall, by the
// 32xh "avg" kernel; `sec` is passed with a stride of 64.  The combined
// squared-error total is stored in *sseptr and the function returns that
// total minus (sum * sum) / 4096.
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse_left, sse_right, sse_total;
  int sum_left, sum_right, sum_total;

  // Columns 0..31.
  sum_left = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                 x_offset, y_offset,
                                                 dst, dst_stride,
                                                 sec, 64, 64, &sse_left);
  // Columns 32..63.
  sum_right = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride,
                                                  x_offset, y_offset,
                                                  dst + 32, dst_stride,
                                                  sec + 32, 64, 64,
                                                  &sse_right);

  sum_total = sum_left + sum_right;
  sse_total = sse_left + sse_right;
  *sseptr = sse_total;

  // The square is formed in 64 bits; 64 * 64 = 4096 pixels, hence the
  // shift by 12.
  return sse_total - (((int64_t)sum_total * sum_total) >> 12);
}
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Sub-pixel variance of a 32x32 block with a second buffer `sec`.
//
// A single call of the 32xh "avg" kernel with height 32 covers the block;
// `sec` is passed with a stride of 32.  The squared-error total it reports
// is stored in *sseptr and the function returns that total minus
// (sum * sum) / 1024.
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sq_err;
  int diff_sum;

  diff_sum = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride,
                                                 x_offset, y_offset,
                                                 dst, dst_stride,
                                                 sec, 32, 32, &sq_err);
  *sseptr = sq_err;

  // The square is formed in 64 bits; 32 * 32 = 1024 pixels, hence the
  // shift by 10.
  return sq_err - (((int64_t)diff_sum * diff_sum) >> 10);
}
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan
269