1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "vpx_config.h"
12#include "vp8/common/variance.h"
13#include "vpx_ports/mem.h"
14#include "vp8/common/x86/filter_x86.h"
15
/*
 * Prototype for a routine defined outside this file.
 * No function in the visible part of this file calls it.
 * NOTE(review): the name suggests a horizontal 6-tap filter pass writing
 * 16-bit intermediates - confirm against the implementation.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
/*
 * Prototype for a routine defined outside this file.
 * No function in the visible part of this file calls it.
 * NOTE(review): the name suggests a vertical 6-tap filter pass turning
 * 16-bit intermediates back into bytes - confirm against the implementation.
 */
extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
36
/*
 * Prototype for a routine defined outside this file; it is not called by
 * any function in the visible part of this file.
 * NOTE(review): presumably returns the sum of squares of a macroblock's
 * coefficients - confirm against the implementation.
 */
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
/*
 * Prototype for the 8x8 helper defined outside this file.
 *
 * The callers in this file read the results back through SSE and Sum and
 * ignore the return value.  They use *SSE as the squared-error total and
 * *Sum as the (signed) difference total of src_ptr versus ref_ptr.
 * NOTE(review): exact semantics of the return value are not visible here.
 */
extern unsigned int vp8_get8x8var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * Prototype for the 4x4 helper defined outside this file.
 *
 * vp8_variance4x4_mmx() reads the results back through SSE and Sum and
 * ignores the return value.
 * NOTE(review): exact semantics of the return value are not visible here.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * Prototype for the 4x4 filter-and-accumulate helper defined outside this
 * file.  It takes one horizontal and one vertical filter row and reports
 * its results through sum and sumsquared.
 * NOTE(review): presumably filters ref_ptr bilinearly and accumulates the
 * difference against src_ptr - confirm against the implementation.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);
/*
 * Prototype for the filter-and-accumulate helper defined outside this
 * file.  Callers in this file pass Height as 8 or 16 and call it once per
 * 8-byte-wide column, reading the results back through sum and sumsquared.
 * NOTE(review): presumably filters ref_ptr bilinearly and accumulates the
 * difference against src_ptr - confirm against the implementation.
 */
extern void vp8_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           const short *HFilter,
                                           const short *VFilter,
                                           int *sum,
                                           unsigned int *sumsquared);
79
80
/*
 * 4x4 variance, MMX path.
 *
 * Obtains the SSE and Sum values for src_ptr versus ref_ptr from
 * vp8_get4x4var_mmx(), stores the SSE in *sse and returns
 * SSE - ((Sum * Sum) >> 4).
 */
unsigned int vp8_variance4x4_mmx(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);

    /* Square in unsigned arithmetic; the shift by 4 divides by 16. */
    mean_term = ((unsigned int)diff_sum * diff_sum) >> 4;

    *sse = sq_err;
    return sq_err - mean_term;
}
96
/*
 * 8x8 variance, MMX path.
 *
 * Obtains the SSE and Sum values for src_ptr versus ref_ptr from
 * vp8_get8x8var_mmx(), stores the SSE in *sse and returns
 * SSE - ((Sum * Sum) >> 6).
 */
unsigned int vp8_variance8x8_mmx(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int mean_term;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);

    /* Square in unsigned arithmetic; the shift by 6 divides by 64. */
    mean_term = ((unsigned int)diff_sum * diff_sum) >> 6;

    *sse = sq_err;
    return sq_err - mean_term;
}
113
/*
 * 16x16 squared-error total, MMX path.
 *
 * Runs vp8_get8x8var_mmx() on four 8x8 sub-blocks (offset 0, +8 bytes,
 * +8 strides, +8 strides +8 bytes), adds up the four SSE outputs, stores
 * the total in *sse and returns that same total.  The Sum outputs are
 * requested only because the helper needs somewhere to write them.
 */
unsigned int vp8_mse16x16_mmx(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int quad_sse[4];
    int quad_sum[4];   /* written by the helper, never read here */
    unsigned int total;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &quad_sse[0], &quad_sum[0]);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride,
                      ref_ptr + 8, recon_stride,
                      &quad_sse[1], &quad_sum[1]);
    vp8_get8x8var_mmx(src_lower, source_stride,
                      ref_lower, recon_stride,
                      &quad_sse[2], &quad_sum[2]);
    vp8_get8x8var_mmx(src_lower + 8, source_stride,
                      ref_lower + 8, recon_stride,
                      &quad_sse[3], &quad_sum[3]);

    total = quad_sse[0] + quad_sse[1] + quad_sse[2] + quad_sse[3];

    *sse = total;
    return total;
}
134
135
/*
 * 16x16 variance, MMX path.
 *
 * Runs vp8_get8x8var_mmx() on four 8x8 sub-blocks (offset 0, +8 bytes,
 * +8 strides, +8 strides +8 bytes) and adds up their SSE and Sum outputs.
 * Stores the SSE total in *sse and returns SSE - ((Sum * Sum) >> 8).
 */
unsigned int vp8_variance16x16_mmx(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int quad_sse[4];
    int quad_sum[4];
    unsigned int total_sse;
    int total_sum;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &quad_sse[0], &quad_sum[0]);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride,
                      ref_ptr + 8, recon_stride,
                      &quad_sse[1], &quad_sum[1]);
    vp8_get8x8var_mmx(src_lower, source_stride,
                      ref_lower, recon_stride,
                      &quad_sse[2], &quad_sum[2]);
    vp8_get8x8var_mmx(src_lower + 8, source_stride,
                      ref_lower + 8, recon_stride,
                      &quad_sse[3], &quad_sum[3]);

    total_sse = quad_sse[0] + quad_sse[1] + quad_sse[2] + quad_sse[3];
    total_sum = quad_sum[0] + quad_sum[1] + quad_sum[2] + quad_sum[3];

    *sse = total_sse;

    /* Square in unsigned arithmetic; the shift by 8 divides by 256. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
}
157
/*
 * 16x8 variance, MMX path.
 *
 * Runs vp8_get8x8var_mmx() at offset 0 and at +8 bytes within the same
 * rows, adds up the two SSE and Sum outputs, stores the SSE total in *sse
 * and returns SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance16x8_mmx(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    unsigned int left_sse, right_sse, total_sse;
    int left_sum, right_sum, total_sum;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &left_sse, &left_sum);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride,
                      ref_ptr + 8, recon_stride,
                      &right_sse, &right_sum);

    total_sse = left_sse + right_sse;
    total_sum = left_sum + right_sum;

    *sse = total_sse;

    /* Square in unsigned arithmetic; the shift by 7 divides by 128. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
177
178
/*
 * 8x16 variance, MMX path.
 *
 * Runs vp8_get8x8var_mmx() at offset 0 and at +8 strides, adds up the two
 * SSE and Sum outputs, stores the SSE total in *sse and returns
 * SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance8x16_mmx(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse)
{
    const unsigned char *src_lower = src_ptr + 8 * source_stride;
    const unsigned char *ref_lower = ref_ptr + 8 * recon_stride;
    unsigned int upper_sse, lower_sse, total_sse;
    int upper_sum, lower_sum, total_sum;

    vp8_get8x8var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &upper_sse, &upper_sum);
    vp8_get8x8var_mmx(src_lower, source_stride,
                      ref_lower, recon_stride,
                      &lower_sse, &lower_sum);

    total_sse = upper_sse + lower_sse;
    total_sum = upper_sum + lower_sum;

    *sse = total_sse;

    /* Square in unsigned arithmetic; the shift by 7 divides by 128. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
199
200
201unsigned int vp8_sub_pixel_variance4x4_mmx
202(
203    const unsigned char  *src_ptr,
204    int  src_pixels_per_line,
205    int  xoffset,
206    int  yoffset,
207    const unsigned char *dst_ptr,
208    int dst_pixels_per_line,
209    unsigned int *sse)
210
211{
212    int xsum;
213    unsigned int xxsum;
214    vp8_filter_block2d_bil4x4_var_mmx(
215        src_ptr, src_pixels_per_line,
216        dst_ptr, dst_pixels_per_line,
217        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
218        &xsum, &xxsum
219    );
220    *sse = xxsum;
221    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
222}
223
224
225unsigned int vp8_sub_pixel_variance8x8_mmx
226(
227    const unsigned char  *src_ptr,
228    int  src_pixels_per_line,
229    int  xoffset,
230    int  yoffset,
231    const unsigned char *dst_ptr,
232    int dst_pixels_per_line,
233    unsigned int *sse
234)
235{
236
237    int xsum;
238    unsigned int xxsum;
239    vp8_filter_block2d_bil_var_mmx(
240        src_ptr, src_pixels_per_line,
241        dst_ptr, dst_pixels_per_line, 8,
242        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
243        &xsum, &xxsum
244    );
245    *sse = xxsum;
246    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
247}
248
249unsigned int vp8_sub_pixel_variance16x16_mmx
250(
251    const unsigned char  *src_ptr,
252    int  src_pixels_per_line,
253    int  xoffset,
254    int  yoffset,
255    const unsigned char *dst_ptr,
256    int dst_pixels_per_line,
257    unsigned int *sse
258)
259{
260
261    int xsum0, xsum1;
262    unsigned int xxsum0, xxsum1;
263
264
265    vp8_filter_block2d_bil_var_mmx(
266        src_ptr, src_pixels_per_line,
267        dst_ptr, dst_pixels_per_line, 16,
268        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
269        &xsum0, &xxsum0
270    );
271
272
273    vp8_filter_block2d_bil_var_mmx(
274        src_ptr + 8, src_pixels_per_line,
275        dst_ptr + 8, dst_pixels_per_line, 16,
276        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
277        &xsum1, &xxsum1
278    );
279
280    xsum0 += xsum1;
281    xxsum0 += xxsum1;
282
283    *sse = xxsum0;
284    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
285
286
287}
288
/*
 * 16x16 sub-pixel squared-error total, MMX path.
 *
 * Delegates to vp8_sub_pixel_variance16x16_mmx() purely for the value it
 * stores in *sse; the variance it returns is discarded and *sse is
 * returned instead.
 */
unsigned int vp8_sub_pixel_mse16x16_mmx(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse)
{
    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
302
303unsigned int vp8_sub_pixel_variance16x8_mmx
304(
305    const unsigned char  *src_ptr,
306    int  src_pixels_per_line,
307    int  xoffset,
308    int  yoffset,
309    const unsigned char *dst_ptr,
310    int dst_pixels_per_line,
311    unsigned int *sse
312)
313{
314    int xsum0, xsum1;
315    unsigned int xxsum0, xxsum1;
316
317
318    vp8_filter_block2d_bil_var_mmx(
319        src_ptr, src_pixels_per_line,
320        dst_ptr, dst_pixels_per_line, 8,
321        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
322        &xsum0, &xxsum0
323    );
324
325
326    vp8_filter_block2d_bil_var_mmx(
327        src_ptr + 8, src_pixels_per_line,
328        dst_ptr + 8, dst_pixels_per_line, 8,
329        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
330        &xsum1, &xxsum1
331    );
332
333    xsum0 += xsum1;
334    xxsum0 += xxsum1;
335
336    *sse = xxsum0;
337    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
338}
339
340unsigned int vp8_sub_pixel_variance8x16_mmx
341(
342    const unsigned char  *src_ptr,
343    int  src_pixels_per_line,
344    int  xoffset,
345    int  yoffset,
346    const unsigned char *dst_ptr,
347    int dst_pixels_per_line,
348    unsigned int *sse
349)
350{
351    int xsum;
352    unsigned int xxsum;
353    vp8_filter_block2d_bil_var_mmx(
354        src_ptr, src_pixels_per_line,
355        dst_ptr, dst_pixels_per_line, 16,
356        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
357        &xsum, &xxsum
358    );
359    *sse = xxsum;
360    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
361}
362
363
/*
 * Thin wrapper: vp8_sub_pixel_variance16x16_mmx() with xoffset fixed at 4
 * and yoffset fixed at 0.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the filter
 * table, as the function name suggests - confirm against the table.
 */
unsigned int vp8_variance_halfpixvar16x16_h_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse)
{
    const int xoffset = 4;
    const int yoffset = 0;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           xoffset, yoffset,
                                           ref_ptr, recon_stride, sse);
}
374
375
/*
 * Thin wrapper: vp8_sub_pixel_variance16x16_mmx() with xoffset fixed at 0
 * and yoffset fixed at 4.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the filter
 * table, as the function name suggests - confirm against the table.
 */
unsigned int vp8_variance_halfpixvar16x16_v_mmx(const unsigned char *src_ptr,
                                                int source_stride,
                                                const unsigned char *ref_ptr,
                                                int recon_stride,
                                                unsigned int *sse)
{
    const int xoffset = 0;
    const int yoffset = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           xoffset, yoffset,
                                           ref_ptr, recon_stride, sse);
}
386
387
/*
 * Thin wrapper: vp8_sub_pixel_variance16x16_mmx() with both xoffset and
 * yoffset fixed at 4.
 * NOTE(review): offset 4 is presumably the half-pixel entry of the filter
 * table, as the function name suggests - confirm against the table.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(const unsigned char *src_ptr,
                                                 int source_stride,
                                                 const unsigned char *ref_ptr,
                                                 int recon_stride,
                                                 unsigned int *sse)
{
    const int xoffset = 4;
    const int yoffset = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           xoffset, yoffset,
                                           ref_ptr, recon_stride, sse);
}
398