/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_config.h"
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/variance.h"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/pragmas.h"
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/x86/filter_x86.h"
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Filter routines defined outside this file.  The h6/v6 suffixes suggest
 * horizontal / vertical filter passes -- TODO confirm against the
 * implementation.  None of them is called in the visible part of this file.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *filter);
21233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* External 4x4 helper used by vp8_sub_pixel_variance4x4_wmt.  It takes two
 * filter tables (HFilter, VFilter) and writes its results through `sum` and
 * `sumsquared`.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

/* External 4x4 helper used by vp8_variance4x4_wmt.  Results are written
 * through `SSE` and `Sum`; the return value is not used in this file.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
43233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* Prototypes for SSE2 helpers defined outside this file. */

unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

/* Block helpers: results are written through `SSE` and `Sum`. */
unsigned int vp8_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);
unsigned int vp8_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);

/* General sub-pixel helper: takes explicit x/y offsets and a row count
 * (`Height`); results are written through `sum` and `sumsquared`.
 */
void vp8_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int xoffset,
                                     int yoffset,
                                     int *sum,
                                     unsigned int *sumsquared);

/* Special-case helpers selected by the wrappers below when an offset is
 * exactly 4.  The 8x / 16x in the names presumably give the block width
 * (TODO confirm); `Height` is supplied by the caller.
 */
void vp8_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);
void vp8_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);
void vp8_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
void vp8_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);
void vp8_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);
void vp8_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
138233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 4x4 variance wrapper.
 *
 * Obtains the SSE and Sum outputs of vp8_get4x4var_mmx for the given source
 * and reference blocks, stores the SSE through `sse`, and returns
 * SSE - ((Sum * Sum) >> 4).
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int sum;
    unsigned int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sum_sq, &sum);

    /* Square in unsigned arithmetic, then divide by 16 via the shift. */
    mean_term = ((unsigned int)sum * sum) >> 4;

    *sse = sum_sq;
    return sum_sq - mean_term;
}
154233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 8x8 variance wrapper.
 *
 * Obtains the SSE and Sum outputs of vp8_get8x8var_sse2, stores the SSE
 * through `sse`, and returns SSE - ((Sum * Sum) >> 6).
 */
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int sum;
    unsigned int mean_term;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sum_sq, &sum);

    /* Square in unsigned arithmetic, then divide by 64 via the shift. */
    mean_term = ((unsigned int)sum * sum) >> 6;

    *sse = sum_sq;
    return sum_sq - mean_term;
}
171233d2500723e5594f3e7c70896ffeeef32b9c950ywan
172233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 16x16 variance wrapper.
 *
 * Obtains the SSE and Sum outputs of vp8_get16x16var_sse2, stores the SSE
 * through `sse`, and returns SSE - ((Sum * Sum) >> 8).
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int sum;
    unsigned int mean_term;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sum_sq, &sum);

    /* Square in unsigned arithmetic, then divide by 256 via the shift. */
    mean_term = ((unsigned int)sum * sum) >> 8;

    *sse = sum_sq;
    return sum_sq - mean_term;
}
/* 16x16 squared-error wrapper.
 *
 * Calls vp8_get16x16var_sse2 and both stores and returns its SSE output
 * unchanged; the Sum output is ignored.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int ignored_sum;  /* the helper requires a Sum destination */

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sum_sq, &ignored_sum);

    *sse = sum_sq;
    return sum_sq;
}
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 16x8 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2 twice -- at column offset 0 and at column offset 8
 * of both blocks -- and adds the two SSE and the two Sum results.  The
 * combined SSE is stored through `sse`; the return value is
 * SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_left, sse_right, sse_total;
    int sum_left, sum_right, sum_total;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_left, &sum_left);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sse_right, &sum_right);

    sse_total = sse_left + sse_right;
    sum_total = sum_left + sum_right;

    *sse = sse_total;
    return sse_total - (((unsigned int)sum_total * sum_total) >> 7);
}
226233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 8x16 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2 twice -- at row offset 0 and eight rows further
 * down in both blocks -- and adds the two SSE and the two Sum results.  The
 * combined SSE is stored through `sse`; the return value is
 * SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_top, sse_bottom, sse_total;
    int sum_top, sum_bottom, sum_total;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_top, &sum_top);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sse_bottom, &sum_bottom);

    sse_total = sse_top + sse_bottom;
    sum_total = sum_top + sum_bottom;

    *sse = sse_total;
    return sse_total - (((unsigned int)sum_total * sum_total) >> 7);
}
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan
248233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance4x4_wmt
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan)
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum;
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum;
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil4x4_var_mmx(
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line,
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum, &xxsum
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum;
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 8x8 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are routed to the dedicated
 * half_horiz / half_vert / half_horiz_vert helpers; every other combination
 * uses the general vp8_filter_block2d_bil_var_sse2 routine.  All helpers are
 * asked for 8 rows.  The sum-of-squares result is stored through `sse` and
 * the return value is sumsquared - ((sum * sum) >> 6).
 */
unsigned int vp8_sub_pixel_variance8x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int sum;
    unsigned int sum_sq;
    const int x_is_four = (xoffset == 4);
    const int y_is_four = (yoffset == 4);

    if (x_is_four && y_is_four)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &sum, &sum_sq);
    }
    else if (x_is_four && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &sum, &sum_sq);
    }
    else if (y_is_four && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &sum, &sum_sq);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &sum, &sum_sq);
    }

    *sse = sum_sq;
    return sum_sq - (((unsigned int)sum * sum) >> 6);
}
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 16x16 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are each handled by a single call to
 * the matching 16x half_* helper.  Any other combination calls the general
 * vp8_filter_block2d_bil_var_sse2 routine twice, at column offsets 0 and 8,
 * and adds the two partial results.  All helpers are asked for 16 rows.
 * The sum-of-squares result is stored through `sse` and the return value is
 * sumsquared - ((sum * sum) >> 8).
 *
 * The dispatch below would be unnecessary if callers invoked the
 * appropriate helper directly.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int sum;
    unsigned int sum_sq;
    const int x_is_four = (xoffset == 4);
    const int y_is_four = (yoffset == 4);

    if (x_is_four && y_is_four)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &sum, &sum_sq);
    }
    else if (x_is_four && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          16, &sum, &sum_sq);
    }
    else if (y_is_four && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &sum, &sum_sq);
    }
    else
    {
        int sum_hi;
        unsigned int sum_sq_hi;

        /* Columns starting at offset 0. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &sum, &sum_sq);

        /* Columns starting at offset 8. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &sum_hi, &sum_sq_hi);

        sum += sum_hi;
        sum_sq += sum_sq_hi;
    }

    *sse = sum_sq;
    return sum_sq - (((unsigned int)sum * sum) >> 8);
}
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 16x16 sub-pixel squared-error wrapper.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt, discards the value it
 * returns, and returns the value it stored through `sse` instead.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/* 16x8 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are each handled by a single call to
 * the matching 16x half_* helper.  Any other combination calls the general
 * vp8_filter_block2d_bil_var_sse2 routine twice, at column offsets 0 and 8,
 * and adds the two partial results.  All helpers are asked for 8 rows.
 * The sum-of-squares result is stored through `sse` and the return value is
 * sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance16x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse

)
{
    int sum;
    unsigned int sum_sq;
    const int x_is_four = (xoffset == 4);
    const int y_is_four = (yoffset == 4);

    if (x_is_four && y_is_four)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &sum, &sum_sq);
    }
    else if (x_is_four && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &sum, &sum_sq);
    }
    else if (y_is_four && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &sum, &sum_sq);
    }
    else
    {
        int sum_hi;
        unsigned int sum_sq_hi;

        /* Columns starting at offset 0. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &sum, &sum_sq);

        /* Columns starting at offset 8. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &sum_hi, &sum_sq_hi);

        sum += sum_hi;
        sum_sq += sum_sq_hi;
    }

    *sse = sum_sq;
    return sum_sq - (((unsigned int)sum * sum) >> 7);
}
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Sub-pixel variance wrapper for an 8-wide block; every helper below is
 * handed the value 16 (presumably the row count, matching "8x16" in the
 * name -- confirm against the helper prototypes).
 *
 * Helper selection by (xoffset, yoffset):
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2 (also receives both offsets)
 *
 * The chosen helper is given pointers to a signed sum and an unsigned
 * sum-of-squares accumulator.  On return *sse holds the sum of squares and
 * the function result is  sum_of_squares - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance8x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const int x_is_half = (xoffset == 4);
    const int y_is_half = (yoffset == 4);
    int diff_sum;
    unsigned int diff_sq_sum;
    unsigned int mean_term;

    /* The four cases are mutually exclusive, so their order is irrelevant. */
    if (x_is_half && y_is_half)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &diff_sum, &diff_sq_sum);
    }
    else if (x_is_half && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &diff_sum, &diff_sq_sum);
    }
    else if (xoffset == 0 && y_is_half)
    {
        vp8_half_vert_variance8x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &diff_sum, &diff_sq_sum);
    }
    else
    {
        /* Generic path: the bilinear helper takes the offsets itself. */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &diff_sum, &diff_sq_sum);
    }

    /*
     * Square the sum in unsigned arithmetic (a negative sum wraps, but its
     * square modulo 2^32 is unchanged), then shift by 7, i.e. divide by 128.
     */
    mean_term = ((unsigned int)diff_sum * (unsigned int)diff_sum) >> 7;

    *sse = diff_sq_sum;
    return diff_sq_sum - mean_term;
}
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance wrapper that always uses vp8_half_horiz_variance16x_h_sse2,
 * passing it the value 16 (presumably the row count for a 16x16 block --
 * confirm against the helper prototype).
 *
 * The helper is given pointers to a signed sum and an unsigned
 * sum-of-squares accumulator.  *sse receives the sum of squares; the
 * return value is  sum_of_squares - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int diff_sq_sum;
    unsigned int mean_term;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &diff_sum, &diff_sq_sum);

    /* Unsigned squaring of the sum, then >> 8, i.e. divide by 256. */
    mean_term = ((unsigned int)diff_sum * (unsigned int)diff_sum) >> 8;

    *sse = diff_sq_sum;
    return diff_sq_sum - mean_term;
}
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance wrapper that always uses vp8_half_vert_variance16x_h_sse2,
 * passing it the value 16 (presumably the row count for a 16x16 block --
 * confirm against the helper prototype).
 *
 * The helper is given pointers to a signed sum and an unsigned
 * sum-of-squares accumulator.  *sse receives the sum of squares; the
 * return value is  sum_of_squares - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int diff_sq_sum;
    unsigned int mean_term;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &diff_sum, &diff_sq_sum);

    /* Unsigned squaring of the sum, then >> 8, i.e. divide by 256. */
    mean_term = ((unsigned int)diff_sum * (unsigned int)diff_sum) >> 8;

    *sse = diff_sq_sum;
    return diff_sq_sum - mean_term;
}
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance wrapper that always uses vp8_half_horiz_vert_variance16x_h_sse2,
 * passing it the value 16 (presumably the row count for a 16x16 block --
 * confirm against the helper prototype).
 *
 * The helper is given pointers to a signed sum and an unsigned
 * sum-of-squares accumulator.  *sse receives the sum of squares; the
 * return value is  sum_of_squares - ((sum * sum) >> 8).
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int diff_sum;
    unsigned int diff_sq_sum;
    unsigned int mean_term;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &diff_sum, &diff_sq_sum);

    /* Unsigned squaring of the sum, then >> 8, i.e. divide by 256. */
    mean_term = ((unsigned int)diff_sum * (unsigned int)diff_sum) >> 8;

    *sse = diff_sq_sum;
    return diff_sq_sum - mean_term;
}
559