1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vp8/encoder/variance.h"
13#include "vp8/common/pragmas.h"
14#include "vpx_ports/mem.h"
15
16extern void filter_block1d_h6_mmx
17(
18    const unsigned char *src_ptr,
19    unsigned short *output_ptr,
20    unsigned int src_pixels_per_line,
21    unsigned int pixel_step,
22    unsigned int output_height,
23    unsigned int output_width,
24    short *vp7_filter
25);
26extern void filter_block1d_v6_mmx
27(
28    const short *src_ptr,
29    unsigned char *output_ptr,
30    unsigned int pixels_per_line,
31    unsigned int pixel_step,
32    unsigned int output_height,
33    unsigned int output_width,
34    short *vp7_filter
35);
36
37extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
38extern unsigned int vp8_get8x8var_mmx
39(
40    const unsigned char *src_ptr,
41    int  source_stride,
42    const unsigned char *ref_ptr,
43    int  recon_stride,
44    unsigned int *SSE,
45    int *Sum
46);
47extern unsigned int vp8_get4x4var_mmx
48(
49    const unsigned char *src_ptr,
50    int  source_stride,
51    const unsigned char *ref_ptr,
52    int  recon_stride,
53    unsigned int *SSE,
54    int *Sum
55);
56extern void vp8_filter_block2d_bil4x4_var_mmx
57(
58    const unsigned char *ref_ptr,
59    int ref_pixels_per_line,
60    const unsigned char *src_ptr,
61    int src_pixels_per_line,
62    const short *HFilter,
63    const short *VFilter,
64    int *sum,
65    unsigned int *sumsquared
66);
67extern void vp8_filter_block2d_bil_var_mmx
68(
69    const unsigned char *ref_ptr,
70    int ref_pixels_per_line,
71    const unsigned char *src_ptr,
72    int src_pixels_per_line,
73    unsigned int Height,
74    const short *HFilter,
75    const short *VFilter,
76    int *sum,
77    unsigned int *sumsquared
78);
79extern unsigned int vp8_get16x16pred_error_mmx
80(
81    unsigned char *src_ptr,
82    int src_stride,
83    unsigned char *ref_ptr,
84    int ref_stride
85);
86
87
/* Compute the SSE and sum of differences for a 16x16 block by combining
 * the four 8x8 quadrant results from the MMX assembly helper.
 *
 * Outputs: *SSE = sum of squared differences, *SUM = sum of differences.
 * Returns the block variance: SSE - SUM^2 / 256.
 */
unsigned int vp8_get16x16var_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned *SSE,
    unsigned *SUM
)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    /* Top-left, top-right, bottom-left, bottom-right 8x8 quadrants. */
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *SSE = var;
    *SUM = avg;

    /* Cast before squaring: |avg| can reach 16*16*255 = 65280, so a signed
     * avg*avg would overflow int (undefined behavior).  The unsigned square
     * still fits in 32 bits (65280^2 < 2^32), so the result is exact. */
    return (var - (((unsigned int)avg * avg) >> 8));
}
114
115
116
117
118
/* Variance of a 4x4 block: one SSE/sum pass in MMX, then
 * var = SSE - sum^2 / 16.  *sse receives the raw SSE. */
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int sum;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sum_sq, &sum);

    *sse = sum_sq;
    return sum_sq - ((sum * sum) >> 4);
}
134
/* Variance of an 8x8 block: one SSE/sum pass in MMX, then
 * var = SSE - sum^2 / 64.  *sse receives the raw SSE. */
unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sum_sq;
    int sum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sum_sq, &sum);

    *sse = sum_sq;
    return sum_sq - ((sum * sum) >> 6);
}
151
/* Sum of squared errors for a 16x16 block, accumulated from the four 8x8
 * quadrants.  Unlike the variance functions, the sum term is discarded:
 * both *sse and the return value are the raw SSE. */
unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int q_sse[4];
    int q_sum[4];
    unsigned int total;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &q_sse[0], &q_sum[0]);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &q_sse[1], &q_sum[1]);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &q_sse[2], &q_sum[2]);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &q_sse[3], &q_sum[3]);

    total = q_sse[0] + q_sse[1] + q_sse[2] + q_sse[3];

    *sse = total;
    return total;
}
172
173
/* Variance of a 16x16 block, accumulated from the four 8x8 quadrants.
 * *sse receives the raw SSE; returns SSE - sum^2 / 256.
 *
 * NOTE(review): sse is declared `int *` here while the sibling variance
 * functions take `unsigned int *` — presumably this matches the existing
 * prototype; left unchanged to preserve the external interface. */
unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    /* Top-left, top-right, bottom-left, bottom-right 8x8 quadrants. */
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;

    /* Cast before squaring: |avg| can reach 65280, so a signed avg*avg
     * would overflow int (undefined behavior); the unsigned product is
     * exact since 65280^2 < 2^32. */
    return (var - (((unsigned int)avg * avg) >> 8));
}
195
/* Variance of a 16x8 block, built from the left and right 8x8 halves.
 * *sse receives the raw SSE; returns SSE - sum^2 / 128. */
unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_l, sse_r, total_sse;
    int sum_l, sum_r, total_sum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse_l, &sum_l);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse_r, &sum_r);

    total_sse = sse_l + sse_r;
    total_sum = sum_l + sum_r;

    *sse = total_sse;
    return total_sse - ((total_sum * total_sum) >> 7);
}
215
216
/* Variance of an 8x16 block, built from the top and bottom 8x8 halves.
 * *sse receives the raw SSE; returns SSE - sum^2 / 128. */
unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_top, sse_bot, total_sse;
    int sum_top, sum_bot, total_sum;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse_top, &sum_top);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse_bot, &sum_bot);

    total_sse = sse_top + sse_bot;
    total_sum = sum_top + sum_bot;

    *sse = total_sse;
    return total_sse - ((total_sum * total_sum) >> 7);
}
237
238
239
240
241///////////////////////////////////////////////////////////////////////////
242// the mmx function that does the bilinear filtering and var calculation //
243// int one pass                                                          //
244///////////////////////////////////////////////////////////////////////////
/* Bilinear filter taps for eighth-pel offsets 0..7.  Each row stores four
 * copies of the first tap followed by four copies of the second tap (a
 * broadcast layout for the MMX filter kernels); the two taps always sum
 * to 128, i.e. 7-bit filter precision. */
DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
{
    { 128, 128, 128, 128,  0,  0,  0,  0 },
    { 112, 112, 112, 112, 16, 16, 16, 16 },
    {  96, 96, 96, 96, 32, 32, 32, 32 },
    {  80, 80, 80, 80, 48, 48, 48, 48 },
    {  64, 64, 64, 64, 64, 64, 64, 64 },
    {  48, 48, 48, 48, 80, 80, 80, 80 },
    {  32, 32, 32, 32, 96, 96, 96, 96 },
    {  16, 16, 16, 16, 112, 112, 112, 112 }
};
256
257unsigned int vp8_sub_pixel_variance4x4_mmx
258(
259    const unsigned char  *src_ptr,
260    int  src_pixels_per_line,
261    int  xoffset,
262    int  yoffset,
263    const unsigned char *dst_ptr,
264    int dst_pixels_per_line,
265    unsigned int *sse)
266
267{
268    int xsum;
269    unsigned int xxsum;
270    vp8_filter_block2d_bil4x4_var_mmx(
271        src_ptr, src_pixels_per_line,
272        dst_ptr, dst_pixels_per_line,
273        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
274        &xsum, &xxsum
275    );
276    *sse = xxsum;
277    return (xxsum - ((xsum * xsum) >> 4));
278}
279
280
281unsigned int vp8_sub_pixel_variance8x8_mmx
282(
283    const unsigned char  *src_ptr,
284    int  src_pixels_per_line,
285    int  xoffset,
286    int  yoffset,
287    const unsigned char *dst_ptr,
288    int dst_pixels_per_line,
289    unsigned int *sse
290)
291{
292
293    int xsum;
294    unsigned int xxsum;
295    vp8_filter_block2d_bil_var_mmx(
296        src_ptr, src_pixels_per_line,
297        dst_ptr, dst_pixels_per_line, 8,
298        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
299        &xsum, &xxsum
300    );
301    *sse = xxsum;
302    return (xxsum - ((xsum * xsum) >> 6));
303}
304
305unsigned int vp8_sub_pixel_variance16x16_mmx
306(
307    const unsigned char  *src_ptr,
308    int  src_pixels_per_line,
309    int  xoffset,
310    int  yoffset,
311    const unsigned char *dst_ptr,
312    int dst_pixels_per_line,
313    unsigned int *sse
314)
315{
316
317    int xsum0, xsum1;
318    unsigned int xxsum0, xxsum1;
319
320
321    vp8_filter_block2d_bil_var_mmx(
322        src_ptr, src_pixels_per_line,
323        dst_ptr, dst_pixels_per_line, 16,
324        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
325        &xsum0, &xxsum0
326    );
327
328
329    vp8_filter_block2d_bil_var_mmx(
330        src_ptr + 8, src_pixels_per_line,
331        dst_ptr + 8, dst_pixels_per_line, 16,
332        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
333        &xsum1, &xxsum1
334    );
335
336    xsum0 += xsum1;
337    xxsum0 += xxsum1;
338
339    *sse = xxsum0;
340    return (xxsum0 - ((xsum0 * xsum0) >> 8));
341
342
343}
344
/* Sub-pixel SSE of a 16x16 block: the variance routine already writes the
 * raw SSE through *sse, so just discard its variance return value. */
unsigned int vp8_sub_pixel_mse16x16_mmx(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line, sse);
    return *sse;
}
358
359unsigned int vp8_sub_pixel_variance16x8_mmx
360(
361    const unsigned char  *src_ptr,
362    int  src_pixels_per_line,
363    int  xoffset,
364    int  yoffset,
365    const unsigned char *dst_ptr,
366    int dst_pixels_per_line,
367    unsigned int *sse
368)
369{
370    int xsum0, xsum1;
371    unsigned int xxsum0, xxsum1;
372
373
374    vp8_filter_block2d_bil_var_mmx(
375        src_ptr, src_pixels_per_line,
376        dst_ptr, dst_pixels_per_line, 8,
377        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
378        &xsum0, &xxsum0
379    );
380
381
382    vp8_filter_block2d_bil_var_mmx(
383        src_ptr + 8, src_pixels_per_line,
384        dst_ptr + 8, dst_pixels_per_line, 8,
385        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
386        &xsum1, &xxsum1
387    );
388
389    xsum0 += xsum1;
390    xxsum0 += xxsum1;
391
392    *sse = xxsum0;
393    return (xxsum0 - ((xsum0 * xsum0) >> 7));
394}
395
396unsigned int vp8_sub_pixel_variance8x16_mmx
397(
398    const unsigned char  *src_ptr,
399    int  src_pixels_per_line,
400    int  xoffset,
401    int  yoffset,
402    const unsigned char *dst_ptr,
403    int dst_pixels_per_line,
404    int *sse
405)
406{
407    int xsum;
408    unsigned int xxsum;
409    vp8_filter_block2d_bil_var_mmx(
410        src_ptr, src_pixels_per_line,
411        dst_ptr, dst_pixels_per_line, 16,
412        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
413        &xsum, &xxsum
414    );
415    *sse = xxsum;
416    return (xxsum - ((xsum * xsum) >> 7));
417}
418
419
/* Horizontal half-pel variance: sub-pixel variance at eighth-pel
 * offset (4, 0), i.e. a half-pixel shift in x only. */
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int var;

    var = vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
                                          ref_ptr, recon_stride, sse);
    return var;
}
430
431
/* Vertical half-pel variance: sub-pixel variance at eighth-pel
 * offset (0, 4), i.e. a half-pixel shift in y only. */
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int var;

    var = vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
                                          ref_ptr, recon_stride, sse);
    return var;
}
442
443
/* Diagonal half-pel variance: sub-pixel variance at eighth-pel
 * offset (4, 4), i.e. a half-pixel shift in both x and y. */
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int var;

    var = vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
                                          ref_ptr, recon_stride, sse);
    return var;
}
454