/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_config.h"
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/variance.h"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/pragmas.h"
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h"
15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp8/common/x86/filter_x86.h"
16233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Prototypes for two externally defined 1-D filter routines.  They are only
 * declared here; none of the functions defined in this file call them.
 * NOTE(review): presumably MMX assembly implementations - the parameter
 * meanings are not established by anything in this file; confirm against
 * the definitions before use.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
37233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Prototypes for the externally defined helpers that the wrappers in this
 * file are built on.  Their definitions are not part of this file.
 */

/* Declared only; not called by any function in this file. */
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);

/*
 * 8x8 helper.  Callers in this file read the two outputs (*SSE and *Sum)
 * after the call and ignore the return value.
 */
extern unsigned int vp8_get8x8var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);

/* 4x4 counterpart of vp8_get8x8var_mmx(); same output convention. */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);

/*
 * Bilinear-filter helper for 4x4 blocks.  Callers in this file pass rows of
 * vp8_bilinear_filters_x86_4 as HFilter/VFilter and read *sum and
 * *sumsquared afterwards.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

/*
 * Bilinear-filter helper taking an explicit Height; callers in this file
 * pass 8 or 16 and cover 16-wide blocks with two calls whose pointers are
 * 8 bytes apart.
 */
extern void vp8_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           const short *HFilter,
                                           const short *VFilter,
                                           int *sum,
                                           unsigned int *sumsquared);
80233d2500723e5594f3e7c70896ffeeef32b9c950ywan
81233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance of a 4x4 block pair, built on vp8_get4x4var_mmx().
 *
 * The pointer/stride arguments are forwarded unchanged to the helper, which
 * reports a squared-error total and a sum through its output pointers.
 *
 * Stores the squared-error total in *sse and returns that total minus
 * (sum * sum) >> 4; the shift of 4 corresponds to the 16 samples of a 4x4
 * block.
 */
unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int blk_sse;
    int blk_sum;
    unsigned int usum;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &blk_sse, &blk_sum);

    /* Square in unsigned arithmetic so the product wraps modulo 2^32
     * instead of overflowing a signed int. */
    usum = (unsigned int)blk_sum;
    *sse = blk_sse;

    return blk_sse - ((usum * usum) >> 4);
}
97233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance of an 8x8 block pair, built on vp8_get8x8var_mmx().
 *
 * The pointer/stride arguments are forwarded unchanged to the helper.
 *
 * Stores the helper's squared-error total in *sse and returns that total
 * minus (sum * sum) >> 6; the shift of 6 corresponds to the 64 samples of
 * an 8x8 block.
 */
unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int blk_sse;
    int blk_sum;
    unsigned int usum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &blk_sse, &blk_sum);

    /* Unsigned multiply: wraps modulo 2^32 rather than overflowing int. */
    usum = (unsigned int)blk_sum;
    *sse = blk_sse;

    return blk_sse - ((usum * usum) >> 6);
}
114233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Squared-error total over a 16x16 block pair, assembled from four
 * vp8_get8x8var_mmx() calls (one per 8x8 quadrant).
 *
 * Stores the combined squared-error total in *sse and returns the same
 * value.  The helper's sum output is required by its signature but is not
 * used here.
 */
unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int quad_sse;
    unsigned int total_sse = 0;
    int unused_sum;

    /* rows 0-7, columns 0-7 */
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &quad_sse, &unused_sum);
    total_sse += quad_sse;

    /* rows 0-7, columns 8-15 */
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
                      &quad_sse, &unused_sum);
    total_sse += quad_sse;

    /* rows 8-15, columns 0-7 */
    vp8_get8x8var_mmx(src_lower, source_stride, ref_lower, recon_stride,
                      &quad_sse, &unused_sum);
    total_sse += quad_sse;

    /* rows 8-15, columns 8-15 */
    vp8_get8x8var_mmx(src_lower + 8, source_stride, ref_lower + 8,
                      recon_stride, &quad_sse, &unused_sum);
    total_sse += quad_sse;

    *sse = total_sse;
    return total_sse;
}
135233d2500723e5594f3e7c70896ffeeef32b9c950ywan
136233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance of a 16x16 block pair, assembled from four vp8_get8x8var_mmx()
 * calls (one per 8x8 quadrant).
 *
 * Stores the combined squared-error total in *sse and returns that total
 * minus (sum * sum) >> 8, where sum is the combined sum of the four
 * quadrants; the shift of 8 corresponds to the 256 samples of the block.
 */
unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int quad_sse;
    unsigned int total_sse = 0;
    int quad_sum;
    int total_sum = 0;
    unsigned int usum;

    /* rows 0-7, columns 0-7 */
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &quad_sse, &quad_sum);
    total_sse += quad_sse;
    total_sum += quad_sum;

    /* rows 0-7, columns 8-15 */
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
                      &quad_sse, &quad_sum);
    total_sse += quad_sse;
    total_sum += quad_sum;

    /* rows 8-15, columns 0-7 */
    vp8_get8x8var_mmx(src_lower, source_stride, ref_lower, recon_stride,
                      &quad_sse, &quad_sum);
    total_sse += quad_sse;
    total_sum += quad_sum;

    /* rows 8-15, columns 8-15 */
    vp8_get8x8var_mmx(src_lower + 8, source_stride, ref_lower + 8,
                      recon_stride, &quad_sse, &quad_sum);
    total_sse += quad_sse;
    total_sum += quad_sum;

    /* Unsigned multiply: wraps modulo 2^32 rather than overflowing int. */
    usum = (unsigned int)total_sum;
    *sse = total_sse;

    return total_sse - ((usum * usum) >> 8);
}
158233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance of a 16-wide, 8-high block pair, assembled from two
 * vp8_get8x8var_mmx() calls placed side by side (pointers 8 bytes apart).
 *
 * Stores the combined squared-error total in *sse and returns that total
 * minus (sum * sum) >> 7; the shift of 7 corresponds to the 128 samples of
 * the block.
 */
unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int left_sse, right_sse;
    int left_sum, right_sum;
    unsigned int total_sse;
    unsigned int usum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &left_sse, &left_sum);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
                      &right_sse, &right_sum);

    total_sse = left_sse + right_sse;

    /* Add the halves as signed ints first, then square as unsigned so the
     * product wraps modulo 2^32 rather than overflowing int. */
    usum = (unsigned int)(left_sum + right_sum);
    *sse = total_sse;

    return total_sse - ((usum * usum) >> 7);
}
178233d2500723e5594f3e7c70896ffeeef32b9c950ywan
179233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Variance of an 8-wide, 16-high block pair, assembled from two
 * vp8_get8x8var_mmx() calls stacked vertically (second call starts eight
 * strides further on in each buffer).
 *
 * Stores the combined squared-error total in *sse and returns that total
 * minus (sum * sum) >> 7; the shift of 7 corresponds to the 128 samples of
 * the block.
 */
unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int upper_sse, lower_sse;
    int upper_sum, lower_sum;
    unsigned int total_sse;
    unsigned int usum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &upper_sse, &upper_sum);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride,
                      ref_ptr + 8 * recon_stride, recon_stride,
                      &lower_sse, &lower_sum);

    total_sse = upper_sse + lower_sse;

    /* Add the halves as signed ints first, then square as unsigned so the
     * product wraps modulo 2^32 rather than overflowing int. */
    usum = (unsigned int)(upper_sum + lower_sum);
    *sse = total_sse;

    return total_sse - ((usum * usum) >> 7);
}
200233d2500723e5594f3e7c70896ffeeef32b9c950ywan
201233d2500723e5594f3e7c70896ffeeef32b9c950ywan
202233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance4x4_mmx
203233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
204233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
205233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
206233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
207233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
208233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
209233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
210233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse)
211233d2500723e5594f3e7c70896ffeeef32b9c950ywan
212233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
213233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum;
214233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum;
215233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil4x4_var_mmx(
216233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
217233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line,
218233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
219233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum, &xxsum
220233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
221233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum;
222233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
223233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
224233d2500723e5594f3e7c70896ffeeef32b9c950ywan
225233d2500723e5594f3e7c70896ffeeef32b9c950ywan
226233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance8x8_mmx
227233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
228233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
229233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
230233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
231233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
232233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
234233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse
235233d2500723e5594f3e7c70896ffeeef32b9c950ywan)
236233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
237233d2500723e5594f3e7c70896ffeeef32b9c950ywan
238233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum;
239233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum;
240233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
241233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
242233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line, 8,
243233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
244233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum, &xxsum
245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
246233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum;
247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
248233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
249233d2500723e5594f3e7c70896ffeeef32b9c950ywan
250233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance16x16_mmx
251233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
256233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
258233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse
259233d2500723e5594f3e7c70896ffeeef32b9c950ywan)
260233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
261233d2500723e5594f3e7c70896ffeeef32b9c950ywan
262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum0, xsum1;
263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum0, xxsum1;
264233d2500723e5594f3e7c70896ffeeef32b9c950ywan
265233d2500723e5594f3e7c70896ffeeef32b9c950ywan
266233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
267233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
268233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line, 16,
269233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
270233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum0, &xxsum0
271233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
272233d2500723e5594f3e7c70896ffeeef32b9c950ywan
273233d2500723e5594f3e7c70896ffeeef32b9c950ywan
274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
275233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr + 8, src_pixels_per_line,
276233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr + 8, dst_pixels_per_line, 16,
277233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
278233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum1, &xxsum1
279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
280233d2500723e5594f3e7c70896ffeeef32b9c950ywan
281233d2500723e5594f3e7c70896ffeeef32b9c950ywan    xsum0 += xsum1;
282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    xxsum0 += xxsum1;
283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
284233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum0;
285233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
286233d2500723e5594f3e7c70896ffeeef32b9c950ywan
287233d2500723e5594f3e7c70896ffeeef32b9c950ywan
288233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * Sub-pixel squared-error total for a 16x16 block.
 *
 * Delegates to vp8_sub_pixel_variance16x16_mmx() with all arguments
 * forwarded unchanged, discards the variance it returns, and returns the
 * value that call stored in *sse.
 */
unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse)
{
    /* Only the *sse side effect is wanted; the variance result is unused. */
    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
303233d2500723e5594f3e7c70896ffeeef32b9c950ywan
304233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance16x8_mmx
305233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
307233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
310233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
312233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse
313233d2500723e5594f3e7c70896ffeeef32b9c950ywan)
314233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum0, xsum1;
316233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum0, xxsum1;
317233d2500723e5594f3e7c70896ffeeef32b9c950ywan
318233d2500723e5594f3e7c70896ffeeef32b9c950ywan
319233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
320233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
321233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line, 8,
322233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
323233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum0, &xxsum0
324233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
325233d2500723e5594f3e7c70896ffeeef32b9c950ywan
326233d2500723e5594f3e7c70896ffeeef32b9c950ywan
327233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
328233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr + 8, src_pixels_per_line,
329233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr + 8, dst_pixels_per_line, 8,
330233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
331233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum1, &xxsum1
332233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
333233d2500723e5594f3e7c70896ffeeef32b9c950ywan
334233d2500723e5594f3e7c70896ffeeef32b9c950ywan    xsum0 += xsum1;
335233d2500723e5594f3e7c70896ffeeef32b9c950ywan    xxsum0 += xxsum1;
336233d2500723e5594f3e7c70896ffeeef32b9c950ywan
337233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum0;
338233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
339233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
340233d2500723e5594f3e7c70896ffeeef32b9c950ywan
341233d2500723e5594f3e7c70896ffeeef32b9c950ywanunsigned int vp8_sub_pixel_variance8x16_mmx
342233d2500723e5594f3e7c70896ffeeef32b9c950ywan(
343233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char  *src_ptr,
344233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  src_pixels_per_line,
345233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  xoffset,
346233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int  yoffset,
347233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const unsigned char *dst_ptr,
348233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int dst_pixels_per_line,
349233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int *sse
350233d2500723e5594f3e7c70896ffeeef32b9c950ywan)
351233d2500723e5594f3e7c70896ffeeef32b9c950ywan{
352233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int xsum;
353233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned int xxsum;
354233d2500723e5594f3e7c70896ffeeef32b9c950ywan    vp8_filter_block2d_bil_var_mmx(
355233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src_ptr, src_pixels_per_line,
356233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst_ptr, dst_pixels_per_line, 16,
357233d2500723e5594f3e7c70896ffeeef32b9c950ywan        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
358233d2500723e5594f3e7c70896ffeeef32b9c950ywan        &xsum, &xxsum
359233d2500723e5594f3e7c70896ffeeef32b9c950ywan    );
360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    *sse = xxsum;
361233d2500723e5594f3e7c70896ffeeef32b9c950ywan    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
362233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
363233d2500723e5594f3e7c70896ffeeef32b9c950ywan
364233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 variance with a horizontal-only sub-pixel offset.
 *
 * Thin wrapper: calls vp8_sub_pixel_variance16x16_mmx() with xoffset 4 and
 * yoffset 0, forwarding every other argument, and returns its result.
 * *sse is written by that call.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the bilinear
 * filter table - confirm against the table definition.
 */
unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse)
{
    const int x_off = 4;
    const int y_off = 0;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 variance with a vertical-only sub-pixel offset.
 *
 * Thin wrapper: calls vp8_sub_pixel_variance16x16_mmx() with xoffset 0 and
 * yoffset 4, forwarding every other argument, and returns its result.
 * *sse is written by that call.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the bilinear
 * filter table - confirm against the table definition.
 */
unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse)
{
    const int x_off = 0;
    const int y_off = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan
/*
 * 16x16 variance with both a horizontal and a vertical sub-pixel offset.
 *
 * Thin wrapper: calls vp8_sub_pixel_variance16x16_mmx() with xoffset 4 and
 * yoffset 4, forwarding every other argument, and returns its result.
 * *sse is written by that call.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the bilinear
 * filter table - confirm against the table definition.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr,
                                                 int source_stride,
                                                 const unsigned char *ref_ptr,
                                                 int recon_stride,
                                                 unsigned int *sse)
{
    const int x_off = 4;
    const int y_off = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
399