1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12#include "vp8/encoder/variance.h"
13#include "vp8/common/pragmas.h"
14#include "vpx_ports/mem.h"
15
/*
 * filter_block1d_* routines defined outside this file.  None of them is
 * called from the part of the file visible here; the declarations are kept
 * in case other code relies on them.
 */
extern void filter_block1d_h6_mmx(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter);

extern void filter_block1d_v6_mmx(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter);

extern void filter_block1d8_h6_sse2(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter);

extern void filter_block1d8_v6_sse2(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter);
20
/*
 * 4x4 helpers defined outside this file.
 *
 * vp8_filter_block2d_bil4x4_var_mmx() is used by
 * vp8_sub_pixel_variance4x4_wmt(); it receives one horizontal and one
 * vertical row of the bilinear filter table and reports its results through
 * `sum` and `sumsquared`.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared);

/*
 * Used by vp8_variance4x4_wmt(); reports its results through `SSE` and
 * `Sum`.  The return value is not used in this file.
 */
extern unsigned int vp8_get4x4var_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum);
42
43unsigned int vp8_get_mb_ss_sse2
44(
45    const short *src_ptr
46);
47unsigned int vp8_get16x16var_sse2
48(
49    const unsigned char *src_ptr,
50    int source_stride,
51    const unsigned char *ref_ptr,
52    int recon_stride,
53    unsigned int *SSE,
54    int *Sum
55);
56unsigned int vp8_get16x16pred_error_sse2
57(
58    const unsigned char *src_ptr,
59    int src_stride,
60    const unsigned char *ref_ptr,
61    int ref_stride
62);
63unsigned int vp8_get8x8var_sse2
64(
65    const unsigned char *src_ptr,
66    int source_stride,
67    const unsigned char *ref_ptr,
68    int recon_stride,
69    unsigned int *SSE,
70    int *Sum
71);
72void vp8_filter_block2d_bil_var_sse2
73(
74    const unsigned char *ref_ptr,
75    int ref_pixels_per_line,
76    const unsigned char *src_ptr,
77    int src_pixels_per_line,
78    unsigned int Height,
79    int  xoffset,
80    int  yoffset,
81    int *sum,
82    unsigned int *sumsquared
83);
84void vp8_half_horiz_vert_variance8x_h_sse2
85(
86    const unsigned char *ref_ptr,
87    int ref_pixels_per_line,
88    const unsigned char *src_ptr,
89    int src_pixels_per_line,
90    unsigned int Height,
91    int *sum,
92    unsigned int *sumsquared
93);
94void vp8_half_horiz_vert_variance16x_h_sse2
95(
96    const unsigned char *ref_ptr,
97    int ref_pixels_per_line,
98    const unsigned char *src_ptr,
99    int src_pixels_per_line,
100    unsigned int Height,
101    int *sum,
102    unsigned int *sumsquared
103);
104void vp8_half_horiz_variance8x_h_sse2
105(
106    const unsigned char *ref_ptr,
107    int ref_pixels_per_line,
108    const unsigned char *src_ptr,
109    int src_pixels_per_line,
110    unsigned int Height,
111    int *sum,
112    unsigned int *sumsquared
113);
114void vp8_half_horiz_variance16x_h_sse2
115(
116    const unsigned char *ref_ptr,
117    int ref_pixels_per_line,
118    const unsigned char *src_ptr,
119    int src_pixels_per_line,
120    unsigned int Height,
121    int *sum,
122    unsigned int *sumsquared
123);
124void vp8_half_vert_variance8x_h_sse2
125(
126    const unsigned char *ref_ptr,
127    int ref_pixels_per_line,
128    const unsigned char *src_ptr,
129    int src_pixels_per_line,
130    unsigned int Height,
131    int *sum,
132    unsigned int *sumsquared
133);
134void vp8_half_vert_variance16x_h_sse2
135(
136    const unsigned char *ref_ptr,
137    int ref_pixels_per_line,
138    const unsigned char *src_ptr,
139    int src_pixels_per_line,
140    unsigned int Height,
141    int *sum,
142    unsigned int *sumsquared
143);
144
145DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
146
/*
 * Variance of a 4x4 block.
 *
 * Asks vp8_get4x4var_mmx() for its SSE and Sum outputs and returns
 * SSE - (Sum * Sum) / 16, 16 being the pixel count of a 4x4 block.
 *
 * NOTE(review): unlike the larger *_wmt variance functions in this file,
 * this one has no `sse` out-parameter, so the raw SSE is never handed back
 * to the caller.  Confirm this matches the prototype callers expect.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride)
{
    int sum_acc;
    unsigned int sse_acc;

    vp8_get4x4var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &sse_acc, &sum_acc);

    /* >> 4 divides the squared sum by the 16 pixels of the block. */
    return sse_acc - ((sum_acc * sum_acc) >> 4);
}
160
161
162
/*
 * Variance of an 8x8 block.
 *
 * Asks vp8_get8x8var_sse2() for its SSE and Sum outputs and returns
 * SSE - (Sum * Sum) / 64, 64 being the pixel count of an 8x8 block.
 *
 * NOTE(review): unlike the larger *_wmt variance functions in this file,
 * this one has no `sse` out-parameter, so the raw SSE is never handed back
 * to the caller.  Confirm this matches the prototype callers expect.
 */
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride)
{
    int sum_acc;
    unsigned int sse_acc;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_acc, &sum_acc);

    /* >> 6 divides the squared sum by the 64 pixels of the block. */
    return sse_acc - ((sum_acc * sum_acc) >> 6);
}
178
179
/*
 * Variance of a 16x16 block.
 *
 * Calls vp8_get16x16var_sse2() for its SSE and Sum outputs, stores the SSE
 * in *sse and returns SSE - (Sum * Sum) / 256 (256 pixels per block).
 *
 * src_ptr / source_stride : first block and its row stride
 * ref_ptr / recon_stride  : second block and its row stride
 * sse                     : out, receives the raw SSE (must not be NULL)
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming Sum is the sum of
     * 256 differences of 8-bit pixels, |sum0| can reach 255 * 256 = 65280,
     * and 65280^2 = 4261478400 exceeds INT_MAX, so a signed multiply would
     * overflow (undefined behaviour).  The true square still fits in 32
     * unsigned bits, so the modular product is exact.
     */
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
/*
 * Raw SSE of a 16x16 block.
 *
 * Calls vp8_get16x16var_sse2() and hands its SSE output back both through
 * *sse and as the return value.  The helper's Sum output is not used.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    int ignored_sum;        /* the helper still needs somewhere to write Sum */
    unsigned int sse_acc;

    vp8_get16x16var_sse2(src_ptr, source_stride,
                         ref_ptr, recon_stride,
                         &sse_acc, &ignored_sum);

    *sse = sse_acc;
    return sse_acc;
}
211
212
/*
 * Variance of a 16-wide, 8-high block.
 *
 * Processes the block as two 8x8 halves (columns 0..7 and 8..15) with
 * vp8_get8x8var_sse2(), adds the two SSE and the two Sum results, stores
 * the combined SSE in *sse and returns SSE - (Sum * Sum) / 128.
 */
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_left, sse_right, sse_total;
    int sum_left, sum_right, sum_total;

    /* left half */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_left, &sum_left);

    /* right half, 8 pixels further along each row */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sse_right, &sum_right);

    sum_total = sum_left + sum_right;
    sse_total = sse_left + sse_right;

    *sse = sse_total;

    /* >> 7 divides the squared sum by the 128 pixels of the block. */
    return sse_total - ((sum_total * sum_total) >> 7);
}
233
/*
 * Variance of an 8-wide, 16-high block.
 *
 * Processes the block as two 8x8 halves (rows 0..7 and 8..15) with
 * vp8_get8x8var_sse2(), adds the two SSE and the two Sum results, stores
 * the combined SSE in *sse and returns SSE - (Sum * Sum) / 128.
 */
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_top, sse_bottom, sse_total;
    int sum_top, sum_bottom, sum_total;

    /* top half */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_top, &sum_top);

    /* bottom half, eight rows down in each buffer */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sse_bottom, &sum_bottom);

    sum_total = sum_top + sum_bottom;
    sse_total = sse_top + sse_bottom;

    *sse = sse_total;

    /* >> 7 divides the squared sum by the 128 pixels of the block. */
    return sse_total - ((sum_total * sum_total) >> 7);
}
254
255unsigned int vp8_sub_pixel_variance4x4_wmt
256(
257    const unsigned char  *src_ptr,
258    int  src_pixels_per_line,
259    int  xoffset,
260    int  yoffset,
261    const unsigned char *dst_ptr,
262    int dst_pixels_per_line,
263    unsigned int *sse
264)
265{
266    int xsum;
267    unsigned int xxsum;
268    vp8_filter_block2d_bil4x4_var_mmx(
269        src_ptr, src_pixels_per_line,
270        dst_ptr, dst_pixels_per_line,
271        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
272        &xsum, &xxsum
273    );
274    *sse = xxsum;
275    return (xxsum - ((xsum * xsum) >> 4));
276}
277
278
/*
 * Sub-pixel variance of an 8x8 block.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2 with the raw offsets
 * Every helper is asked for 8 rows.  The helper's `sumsquared` output is
 * stored in *sse and the function returns sumsquared - (sum * sum) / 64.
 */
unsigned int vp8_sub_pixel_variance8x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    unsigned int sq_total;
    int total;

    if (x_is_4 && y_is_4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              8, &total, &sq_total);
    }
    else if (x_is_4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &total, &sq_total);
    }
    else if (y_is_4 && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, &total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &total, &sq_total);
    }

    *sse = sq_total;

    /* >> 6 divides the squared sum by the 64 pixels of the block. */
    return sq_total - ((total * total) >> 6);
}
326
/*
 * Sub-pixel variance of a 16x16 block.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called once at column 0 and
 *             once at column 8, with the two results added together
 * Every helper is asked for 16 rows.  The accumulated `sumsquared` value is
 * stored in *sse (must not be NULL) and the function returns
 * sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    // These tests could be avoided if the caller invoked the appropriate
    // specialised function directly.
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        /* left 8 columns */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0
        );

        /* right 8 columns */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum1, &xxsum1
        );
        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming the sum covers 256
     * differences of 8-bit pixels, |xsum0| can reach 255 * 256 = 65280, and
     * 65280^2 = 4261478400 exceeds INT_MAX, so a signed multiply would
     * overflow (undefined behaviour).  The true square still fits in 32
     * unsigned bits, so the modular product is exact.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
387
/*
 * Sub-pixel SSE of a 16x16 block.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() only for the value it
 * stores in *sse; the variance it returns is discarded and the stored SSE
 * is returned instead.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);

    return *sse;
}
401
/*
 * Sub-pixel variance of a 16-wide, 8-high block.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance16x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance16x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance16x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2, called once at column 0 and
 *             once at column 8, with the two results added together
 * Every helper is asked for 8 rows.  The accumulated `sumsquared` value is
 * stored in *sse and the function returns sumsquared - (sum * sum) / 128.
 */
unsigned int vp8_sub_pixel_variance16x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse

)
{
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    unsigned int sq_total, sq_right;
    int total, right;

    if (x_is_4 && y_is_4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &total, &sq_total);
    }
    else if (x_is_4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line,
                                          8, &total, &sq_total);
    }
    else if (y_is_4 && xoffset == 0)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         8, &total, &sq_total);
    }
    else
    {
        /* left 8 columns */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &total, &sq_total);

        /* right 8 columns */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line,
                                        8, xoffset, yoffset,
                                        &right, &sq_right);

        total += right;
        sq_total += sq_right;
    }

    *sse = sq_total;

    /* >> 7 divides the squared sum by the 128 pixels of the block. */
    return sq_total - ((total * total) >> 7);
}
458
/*
 * Sub-pixel variance of an 8-wide, 16-high block.
 *
 * Dispatches on (xoffset, yoffset):
 *   (4, 0) -> vp8_half_horiz_variance8x_h_sse2
 *   (0, 4) -> vp8_half_vert_variance8x_h_sse2
 *   (4, 4) -> vp8_half_horiz_vert_variance8x_h_sse2
 *   other  -> vp8_filter_block2d_bil_var_sse2 with the raw offsets
 * Every helper is asked for 16 rows.  The helper's `sumsquared` output is
 * stored in *sse and the function returns sumsquared - (sum * sum) / 128.
 */
unsigned int vp8_sub_pixel_variance8x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    unsigned int sq_total;
    int total;

    if (x_is_4 && y_is_4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &total, &sq_total);
    }
    else if (x_is_4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &total, &sq_total);
    }
    else if (y_is_4 && xoffset == 0)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, &total, &sq_total);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line,
                                        16, xoffset, yoffset,
                                        &total, &sq_total);
    }

    *sse = sq_total;

    /* >> 7 divides the squared sum by the 128 pixels of the block. */
    return sq_total - ((total * total) >> 7);
}
506
507
/*
 * 16x16 variance using the horizontal half-position helper.
 *
 * Calls vp8_half_horiz_variance16x_h_sse2() for 16 rows, stores the
 * helper's `sumsquared` output in *sse (must not be NULL) and returns
 * sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming the sum covers 256
     * differences of 8-bit pixels, |xsum0| can reach 65280, and 65280^2
     * exceeds INT_MAX, so a signed multiply would overflow (undefined
     * behaviour).  The true square still fits in 32 unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
526
527
/*
 * 16x16 variance using the vertical half-position helper.
 *
 * Calls vp8_half_vert_variance16x_h_sse2() for 16 rows, stores the
 * helper's `sumsquared` output in *sse (must not be NULL) and returns
 * sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming the sum covers 256
     * differences of 8-bit pixels, |xsum0| can reach 65280, and 65280^2
     * exceeds INT_MAX, so a signed multiply would overflow (undefined
     * behaviour).  The true square still fits in 32 unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
545
546
/*
 * 16x16 variance using the combined horizontal+vertical half-position
 * helper.
 *
 * Calls vp8_half_horiz_vert_variance16x_h_sse2() for 16 rows, stores the
 * helper's `sumsquared` output in *sse (must not be NULL) and returns
 * sumsquared - (sum * sum) / 256.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /*
     * Square the sum in unsigned arithmetic.  Assuming the sum covers 256
     * differences of 8-bit pixels, |xsum0| can reach 65280, and 65280^2
     * exceeds INT_MAX, so a signed multiply would overflow (undefined
     * behaviour).  The true square still fits in 32 unsigned bits.
     */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
565