1233d2500723e5594f3e7c70896ffeeef32b9c950ywan/*
2233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3233d2500723e5594f3e7c70896ffeeef32b9c950ywan *
4233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  Use of this source code is governed by a BSD-style license
5233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  that can be found in the LICENSE file in the root of the source
6233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  tree. An additional intellectual property rights grant can be found
7233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  in the file PATENTS.  All contributing project authors may
8233d2500723e5594f3e7c70896ffeeef32b9c950ywan *  be found in the AUTHORS file in the root of the source tree.
9233d2500723e5594f3e7c70896ffeeef32b9c950ywan */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vpx_config.h"
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/encoder/vp9_variance.h"
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_pragmas.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h"
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 4x4 kernel defined outside this file.
 *
 * It reports its results through SSE and Sum: variance_sse2() reads both
 * outputs after every call and never looks at the return value.
 * NOTE(review): by its name this is the MMX assembly routine - confirm.
 */
extern unsigned int vp9_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
26233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 kernel defined outside this file.
 *
 * Results come back through SSE and Sum; the callers in this file ignore the
 * return value.  Used directly by vp9_mse16x16_sse2() and as the tile
 * callback for every block size of 16x16 and larger.
 */
unsigned int vp9_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);
/*
 * 8x8 kernel defined outside this file.
 *
 * Results come back through SSE and Sum; the callers in this file ignore the
 * return value.  Used as the tile callback for the 8x8, 16x8 and 8x16
 * variance functions.
 */
unsigned int vp9_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);
/*
 * Half-pixel (horizontal + vertical) kernels for 8- and 16-wide blocks.
 * Only the prototypes are visible here; the definitions live elsewhere.
 * NOTE(review): by the parameter names, Height is the number of rows to
 * process and the results are written to *sum and *sumsquared - confirm
 * against the implementation.
 */
void vp9_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);
void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);
/*
 * Half-pixel horizontal kernels for 8- and 16-wide blocks.
 * Only the prototypes are visible here; the definitions live elsewhere.
 * NOTE(review): by the parameter names, Height is the number of rows to
 * process and the results are written to *sum and *sumsquared - confirm
 * against the implementation.
 */
void vp9_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);
/*
 * Half-pixel vertical kernels for 8- and 16-wide blocks.
 * Only the prototypes are visible here; the definitions live elsewhere.
 * NOTE(review): by the parameter names, Height is the number of rows to
 * process and the results are written to *sum and *sumsquared - confirm
 * against the implementation.
 */
void vp9_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);
void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
105233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Signature shared by the fixed-size kernels that variance_sse2() tiles over
 * a block.  A kernel must store its results in *SSE and *Sum; the return
 * value is not used by variance_sse2().
 */
typedef unsigned int (*get_var_sse2)(const unsigned char *src_ptr,
                                     int source_stride,
                                     const unsigned char *ref_ptr,
                                     int recon_stride,
                                     unsigned int *SSE,
                                     int *Sum);

/*
 * Covers a w x h block with block_size x block_size tiles, runs var_fn on
 * each tile and adds up the per-tile outputs.
 *
 *   src_ptr / source_stride  first operand and its row pitch
 *   ref_ptr / recon_stride   second operand and its row pitch
 *   w, h                     block dimensions; stepped in units of block_size
 *   sse, sum                 receive the totals (both 0 when w <= 0 or h <= 0)
 *   var_fn                   kernel invoked once per tile
 *   block_size               tile edge length; must be positive or the loops
 *                            never terminate
 */
static void variance_sse2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_sse2 var_fn, int block_size) {
  unsigned int total_sse = 0;
  int total_sum = 0;
  int row, col;

  for (row = 0; row < h; row += block_size) {
    /* Start of the current tile row in each plane. */
    const unsigned char *src_row = src_ptr + source_stride * row;
    const unsigned char *ref_row = ref_ptr + recon_stride * row;

    for (col = 0; col < w; col += block_size) {
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src_row + col, source_stride, ref_row + col, recon_stride,
             &tile_sse, &tile_sum);
      total_sse += tile_sse;
      total_sum += tile_sum;
    }
  }

  *sse = total_sse;
  *sum = total_sum;
}
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan
136233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance4x4_sse2(
137233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
139233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
140233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
141233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
142233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
143233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
144233d2500723e5594f3e7c70896ffeeef32b9c950ywan
145233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
146233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  &var, &avg, vp9_get4x4var_mmx, 4);
147233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
148233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 4));
149233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
150233d2500723e5594f3e7c70896ffeeef32b9c950ywan
151233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
152233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  int  source_stride,
153233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *ref_ptr,
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  int  recon_stride,
155233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  unsigned int *sse) {
156233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
157233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan
159233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
160233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  &var, &avg, vp9_get4x4var_mmx, 4);
161233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
162233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 5));
163233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
164233d2500723e5594f3e7c70896ffeeef32b9c950ywan
165233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
166233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  int  source_stride,
167233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *ref_ptr,
168233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  int  recon_stride,
169233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  unsigned int *sse) {
170233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan
173233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
174233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  &var, &avg, vp9_get4x4var_mmx, 4);
175233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
176233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 5));
177233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan
179233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance8x8_sse2
180233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
181233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
182233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
183233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
184233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
185233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
186233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
187233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
188233d2500723e5594f3e7c70896ffeeef32b9c950ywan
189233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
190233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  &var, &avg, vp9_get8x8var_sse2, 8);
191233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
192233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 6));
193233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
194233d2500723e5594f3e7c70896ffeeef32b9c950ywan
195233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance16x8_sse2
196233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
197233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
198233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
199233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
202233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  &var, &avg, vp9_get8x8var_sse2, 8);
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 7));
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan
211233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance8x16_sse2
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get8x8var_sse2, 8);
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 7));
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan
227233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance16x16_sse2
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *src_ptr,
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  source_stride,
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const unsigned char *ref_ptr,
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int  recon_stride,
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int *sse) {
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((unsigned int)avg * avg) >> 8));
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 MSE: runs the 16x16 kernel once and reports only its SSE output,
 * both through *sse and as the return value.  No mean term is subtracted.
 */
unsigned int vp9_mse16x16_sse2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sq_err;
  int unused_sum;  /* the kernel signature requires it; the value is dropped */

  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &unused_sum);
  *sse = sq_err;
  return sq_err;
}
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan
257233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 10));
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan
271233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 9));
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan
285233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
290233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
291233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
292233d2500723e5594f3e7c70896ffeeef32b9c950ywan
293233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
294233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
295233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
296233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 9));
297233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
298233d2500723e5594f3e7c70896ffeeef32b9c950ywan
299233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
300233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
301233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
302233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
304233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 12));
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan
313233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 11));
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  source_stride,
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *ref_ptr,
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    int  recon_stride,
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    unsigned int *sse) {
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned int var;
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int avg;
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan                &var, &avg, vp9_get16x16var_sse2, 16);
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan  *sse = var;
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan  return (var - (((int64_t)avg * avg) >> 11));
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Prototypes for the fixed-width, variable-height sub-pixel variance
 * kernels, which are defined outside this file.
 *
 * DECL(w, opt) declares vp9_sub_pixel_variance<w>xh_<opt>.  Each kernel
 * returns an int and stores a second result in *sse.
 * DECLS(opt1, opt2) declares the 4-wide kernel with suffix opt2 and the
 * 8- and 16-wide kernels with suffix opt1, so DECLS(sse2, sse) yields
 * ..4xh_sse, ..8xh_sse2 and ..16xh_sse2.
 */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines
 * vp9_sub_pixel_variance<w>x<h>_<opt>.
 *
 * The w x h block is processed as columns that are wf pixels wide, using
 * the vp9_sub_pixel_variance<wf>xh_<opt> kernel at height h.  The generated
 * code handles w == wf (one column), w == 2 * wf (two columns) and
 * w == 4 * wf (four columns); no other ratio is supported.
 *
 * The kernel's int return values are summed into `se` and its *sse outputs
 * into `sse`.  *sse_ptr receives the summed sse, and the function returns
 * sse - ((se * se) >> (wlog2 + hlog2)), where wlog2 + hlog2 is log2 of the
 * pixel count.  `cast` selects the type in which se * se is evaluated:
 * int64_t for the larger blocks, unsigned int for the rest.
 *
 * Column offsets are written as multiples of wf rather than literal
 * 16/32/48, so they stay correct for any column width.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if ((w) > (wf)) { \
    unsigned int col_sse; \
    int col_se = vp9_sub_pixel_variance##wf##xh_##opt(src + (wf), \
                                                      src_stride, \
                                                      x_offset, y_offset, \
                                                      dst + (wf), \
                                                      dst_stride, \
                                                      h, &col_sse); \
    se += col_se; \
    sse += col_sse; \
    if ((w) > (wf) * 2) { \
      col_se = vp9_sub_pixel_variance##wf##xh_##opt(src + 2 * (wf), \
                                                    src_stride, \
                                                    x_offset, y_offset, \
                                                    dst + 2 * (wf), \
                                                    dst_stride, \
                                                    h, &col_sse); \
      se += col_se; \
      sse += col_sse; \
      col_se = vp9_sub_pixel_variance##wf##xh_##opt(src + 3 * (wf), \
                                                    src_stride, \
                                                    x_offset, y_offset, \
                                                    dst + 3 * (wf), \
                                                    dst_stride, \
                                                    h, &col_sse); \
      se += col_se; \
      sse += col_sse; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

/*
 * Instantiates every block size.  opt1 is the suffix for the 8- and 16-wide
 * kernels, opt2 the suffix for the 4-wide kernel.  There is deliberately no
 * ';' after each FN(...): it expands to a complete function definition, and
 * a stray semicolon at file scope is not valid ISO C.
 */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)) \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)) \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)) \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)) \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)) \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)) \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse)
FNS(ssse3, ssse3)

#undef FNS
#undef FN
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Prototypes for the fixed-width, variable-height kernels
 * vp9_sub_pixel_avg_variance<w>xh_<opt>().  They are only declared here;
 * their definitions are not in this part of the file.
 *
 * DECLS(opt1, opt2) declares the 4-wide kernel with suffix opt2 and the
 * 8- and 16-wide kernels with suffix opt1.  DECL itself leaves off the
 * trailing ';' so that each use reads like an ordinary declaration.
 */
#define DECLS(opt1, opt2) \
  DECL(4, opt2); \
  DECL(8, opt1); \
  DECL(16, opt1)

#define DECL(w, opt) \
  int vp9_sub_pixel_avg_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, \
      int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, \
      const uint8_t *sec, ptrdiff_t sec_stride, \
      int height, unsigned int *sse)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);

#undef DECLS
#undef DECL
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast) defines
 * vp9_sub_pixel_avg_variance<w>x<h>_<opt>().
 *
 * The generated function walks the block in column strips of width wf and
 * calls the kernel vp9_sub_pixel_avg_variance<wf>xh_<opt>() once per strip,
 * offsetting src, dst and sec by the strip's starting column.  The kernel's
 * return value is accumulated into se and the value it writes through its
 * last argument is accumulated into sse (presumably the sum of errors and the
 * sum of squared errors -- confirm against the kernel).  The total sse is
 * stored through sseptr, and the function returns
 *     sse - ((cast se * se) >> (wlog2 + hlog2)).
 *
 *   w, h          block width and height; also forwarded to the kernel as
 *                 the stride of sec and as the height
 *   wf            width covered by one kernel call; w must be a multiple of
 *                 wf
 *   wlog2, hlog2  shift amounts for the squared-sum term
 *                 (NOTE(review): presumably log2(w) and log2(h))
 *   opt           kernel suffix
 *   cast          parenthesised type applied to se before it is squared
 *
 * Strips are stepped by wf instead of using fixed offsets of 16, 32 and 48,
 * so the macro no longer silently depends on wf being 16 whenever w > wf.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse = 0; \
  int se = 0; \
  int col; \
  for (col = 0; col < (w); col += (wf)) { \
    unsigned int strip_sse; \
    se += vp9_sub_pixel_avg_variance##wf##xh_##opt(src + col, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + col, dst_stride, \
                                                   sec + col, w, h, \
                                                   &strip_sse); \
    sse += strip_sse; \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan#define FNS(opt1, opt2) \
478233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(64, 64, 16, 6, 6, opt1, (int64_t)); \
479233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(64, 32, 16, 6, 5, opt1, (int64_t)); \
480233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(32, 64, 16, 5, 6, opt1, (int64_t)); \
481233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(32, 32, 16, 5, 5, opt1, (int64_t)); \
482233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(32, 16, 16, 5, 4, opt1, (int64_t)); \
483233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(16, 32, 16, 4, 5, opt1, (int64_t)); \
484233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
485233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
486233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
487233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
488233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
489233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
490233d2500723e5594f3e7c70896ffeeef32b9c950ywanFN(4,   4,  4, 2, 2, opt2, (unsigned int))
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan
492233d2500723e5594f3e7c70896ffeeef32b9c950ywanFNS(sse2, sse);
493233d2500723e5594f3e7c70896ffeeef32b9c950ywanFNS(ssse3, ssse3);
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan#undef FNS
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan#undef FN
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 wrapper around vp9_half_horiz_variance16x_h_sse2().
 *
 * The helper receives both pointers with their strides and the constant 16
 * (presumably the number of rows -- confirm against the helper), and writes
 * back an int and an unsigned int.  These are used here as a sum and a sum
 * of squares (NOTE(review): inferred from how they are combined).
 *
 * The sum of squares is stored in *sse.  The return value is the sum of
 * squares minus the squared sum shifted right by 8, with the square taken in
 * unsigned arithmetic.
 */
unsigned int vp9_variance_halfpixvar16x16_h_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_sq;
  unsigned int sum_squared;

  vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pixels_per_line,
                                    16, &sum, &sum_sq);

  sum_squared = (unsigned int)sum * sum;
  *sse = sum_sq;
  return sum_sq - (sum_squared >> 8);
}
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 wrapper around vp9_half_vert_variance16x_h_sse2().
 *
 * The helper receives both pointers with their strides and the constant 16
 * (presumably the number of rows -- confirm against the helper), and writes
 * back an int and an unsigned int.  These are used here as a sum and a sum
 * of squares (NOTE(review): inferred from how they are combined).
 *
 * The sum of squares is stored in *sse.  The return value is the sum of
 * squares minus the squared sum shifted right by 8, with the square taken in
 * unsigned arithmetic.
 */
unsigned int vp9_variance_halfpixvar16x16_v_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_sq;
  unsigned int sum_squared;

  vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                   dst_ptr, dst_pixels_per_line,
                                   16, &sum, &sum_sq);

  sum_squared = (unsigned int)sum * sum;
  *sse = sum_sq;
  return sum_sq - (sum_squared >> 8);
}
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 wrapper around vp9_half_horiz_vert_variance16x_h_sse2().
 *
 * The helper receives both pointers with their strides and the constant 16
 * (presumably the number of rows -- confirm against the helper), and writes
 * back an int and an unsigned int.  These are used here as a sum and a sum
 * of squares (NOTE(review): inferred from how they are combined).
 *
 * The sum of squares is stored in *sse.  The return value is the sum of
 * squares minus the squared sum shifted right by 8, with the square taken in
 * unsigned arithmetic.
 */
unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_sq;
  unsigned int sum_squared;

  vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &sum, &sum_sq);

  sum_squared = (unsigned int)sum * sum;
  *sse = sum_sq;
  return sum_sq - (sum_squared >> 8);
}
552