1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "vpx_config.h"
12#include "vp8/common/variance.h"
13#include "vp8/common/pragmas.h"
14#include "vpx_ports/mem.h"
15#include "vp8/common/x86/filter_x86.h"
16
/*
 * Defined outside this file and not called from it.
 * NOTE(review): the name suggests a horizontal 6-tap filter pass over
 * 8-bit input producing 16-bit output -- confirm against the definition.
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
/*
 * Defined outside this file and not called from it.
 * NOTE(review): the name suggests a vertical 6-tap filter pass over
 * 16-bit input producing 8-bit output -- confirm against the definition.
 */
extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *filter);
37
/*
 * Defined outside this file and not called from it.
 * NOTE(review): presumably returns a sum of squares over a macroblock's
 * worth of 16-bit values -- confirm against the definition.
 */
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
/*
 * 8x8 block statistics helper, defined outside this file.
 *
 * Callers in this file ignore the return value and read *SSE and *Sum
 * after the call, combining them as SSE - ((Sum * Sum) >> log2(pixels)).
 * Larger blocks are covered by calling it once per 8x8 tile.
 */
extern unsigned int vp8_get8x8var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * 4x4 block statistics helper, defined outside this file.
 *
 * The caller in this file ignores the return value and reads *SSE and
 * *Sum after the call.
 */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * Bilinear-filter-and-accumulate helper for 4x4 blocks, defined outside
 * this file.
 *
 * The caller in this file passes one row of vp8_bilinear_filters_x86_4
 * for each of HFilter and VFilter and reads *sum and *sumsquared after
 * the call.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);
/*
 * Bilinear-filter-and-accumulate helper with a caller-supplied Height,
 * defined outside this file.
 *
 * Callers in this file pass Height 8 or 16 and one row of
 * vp8_bilinear_filters_x86_4 for each of HFilter and VFilter, then read
 * *sum and *sumsquared. 16-wide blocks are handled by calling it twice,
 * the second time with both pointers advanced by 8.
 * NOTE(review): that usage implies it covers 8 columns per call --
 * confirm against the definition.
 */
extern void vp8_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           const short *HFilter,
                                           const short *VFilter,
                                           int *sum,
                                           unsigned int *sumsquared);
80
81
/*
 * 4x4 variance between src_ptr and ref_ptr.
 *
 * Stores the SSE reported by vp8_get4x4var_mmx in *sse and returns
 * SSE - ((Sum * Sum) >> 4); the shift divides by the 16 pixels of the
 * block.
 */
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);
    *sse = sq_err;

    /* Unsigned multiply: the square is taken modulo 2^32. */
    mean_term = ((unsigned int)diff_sum * diff_sum) >> 4;
    return sq_err - mean_term;
}
97
/*
 * 8x8 variance between src_ptr and ref_ptr.
 *
 * Stores the SSE reported by vp8_get8x8var_mmx in *sse and returns
 * SSE - ((Sum * Sum) >> 6); the shift divides by the 64 pixels of the
 * block.
 */
unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    int diff_sum;
    unsigned int mean_term;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);
    *sse = sq_err;

    /* Unsigned multiply: the square is taken modulo 2^32. */
    mean_term = ((unsigned int)diff_sum * diff_sum) >> 6;
    return sq_err - mean_term;
}
114
/*
 * 16x16 sum of squared error between src_ptr and ref_ptr.
 *
 * Adds up the SSE outputs of vp8_get8x8var_mmx over the four 8x8
 * quadrants (top-left, top-right, bottom-left, bottom-right), stores
 * the total in *sse and returns the same value. The per-quadrant Sum
 * outputs are not used.
 */
unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int total_sse = 0;
    int row, col;

    for (row = 0; row < 16; row += 8)
    {
        for (col = 0; col < 16; col += 8)
        {
            unsigned int quad_sse;
            int quad_sum;   /* required by the helper, ignored here */

            vp8_get8x8var_mmx(src_ptr + row * source_stride + col,
                              source_stride,
                              ref_ptr + row * recon_stride + col,
                              recon_stride,
                              &quad_sse, &quad_sum);
            total_sse += quad_sse;
        }
    }

    *sse = total_sse;
    return total_sse;
}
135
136
/*
 * 16x16 variance between src_ptr and ref_ptr.
 *
 * Accumulates the SSE and Sum outputs of vp8_get8x8var_mmx over the
 * four 8x8 quadrants (top-left, top-right, bottom-left, bottom-right).
 * Stores the total SSE in *sse and returns
 * SSE - ((Sum * Sum) >> 8); the shift divides by the 256 pixels of the
 * block.
 */
unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int total_sse = 0;
    int total_sum = 0;
    int row, col;

    for (row = 0; row < 16; row += 8)
    {
        for (col = 0; col < 16; col += 8)
        {
            unsigned int quad_sse;
            int quad_sum;

            vp8_get8x8var_mmx(src_ptr + row * source_stride + col,
                              source_stride,
                              ref_ptr + row * recon_stride + col,
                              recon_stride,
                              &quad_sse, &quad_sum);
            total_sse += quad_sse;
            total_sum += quad_sum;
        }
    }

    *sse = total_sse;
    /* Unsigned multiply: the square is taken modulo 2^32. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
}
158
/*
 * 16x8 (16 wide, 8 high) variance between src_ptr and ref_ptr.
 *
 * Accumulates the SSE and Sum outputs of vp8_get8x8var_mmx over the
 * left and right 8x8 halves. Stores the total SSE in *sse and returns
 * SSE - ((Sum * Sum) >> 7); the shift divides by the 128 pixels of the
 * block.
 */
unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int total_sse = 0;
    int total_sum = 0;
    int col;

    for (col = 0; col < 16; col += 8)
    {
        unsigned int half_sse;
        int half_sum;

        vp8_get8x8var_mmx(src_ptr + col, source_stride,
                          ref_ptr + col, recon_stride,
                          &half_sse, &half_sum);
        total_sse += half_sse;
        total_sum += half_sum;
    }

    *sse = total_sse;
    /* Unsigned multiply: the square is taken modulo 2^32. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
178
179
/*
 * 8x16 (8 wide, 16 high) variance between src_ptr and ref_ptr.
 *
 * Accumulates the SSE and Sum outputs of vp8_get8x8var_mmx over the
 * top and bottom 8x8 halves. Stores the total SSE in *sse and returns
 * SSE - ((Sum * Sum) >> 7); the shift divides by the 128 pixels of the
 * block.
 */
unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int total_sse = 0;
    int total_sum = 0;
    int row;

    for (row = 0; row < 16; row += 8)
    {
        unsigned int half_sse;
        int half_sum;

        vp8_get8x8var_mmx(src_ptr + row * source_stride, source_stride,
                          ref_ptr + row * recon_stride, recon_stride,
                          &half_sse, &half_sum);
        total_sse += half_sse;
        total_sum += half_sum;
    }

    *sse = total_sse;
    /* Unsigned multiply: the square is taken modulo 2^32. */
    return total_sse - (((unsigned int)total_sum * total_sum) >> 7);
}
200
201
202unsigned int vp8_sub_pixel_variance4x4_mmx
203(
204    const unsigned char  *src_ptr,
205    int  src_pixels_per_line,
206    int  xoffset,
207    int  yoffset,
208    const unsigned char *dst_ptr,
209    int dst_pixels_per_line,
210    unsigned int *sse)
211
212{
213    int xsum;
214    unsigned int xxsum;
215    vp8_filter_block2d_bil4x4_var_mmx(
216        src_ptr, src_pixels_per_line,
217        dst_ptr, dst_pixels_per_line,
218        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
219        &xsum, &xxsum
220    );
221    *sse = xxsum;
222    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
223}
224
225
226unsigned int vp8_sub_pixel_variance8x8_mmx
227(
228    const unsigned char  *src_ptr,
229    int  src_pixels_per_line,
230    int  xoffset,
231    int  yoffset,
232    const unsigned char *dst_ptr,
233    int dst_pixels_per_line,
234    unsigned int *sse
235)
236{
237
238    int xsum;
239    unsigned int xxsum;
240    vp8_filter_block2d_bil_var_mmx(
241        src_ptr, src_pixels_per_line,
242        dst_ptr, dst_pixels_per_line, 8,
243        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
244        &xsum, &xxsum
245    );
246    *sse = xxsum;
247    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
248}
249
250unsigned int vp8_sub_pixel_variance16x16_mmx
251(
252    const unsigned char  *src_ptr,
253    int  src_pixels_per_line,
254    int  xoffset,
255    int  yoffset,
256    const unsigned char *dst_ptr,
257    int dst_pixels_per_line,
258    unsigned int *sse
259)
260{
261
262    int xsum0, xsum1;
263    unsigned int xxsum0, xxsum1;
264
265
266    vp8_filter_block2d_bil_var_mmx(
267        src_ptr, src_pixels_per_line,
268        dst_ptr, dst_pixels_per_line, 16,
269        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
270        &xsum0, &xxsum0
271    );
272
273
274    vp8_filter_block2d_bil_var_mmx(
275        src_ptr + 8, src_pixels_per_line,
276        dst_ptr + 8, dst_pixels_per_line, 16,
277        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
278        &xsum1, &xxsum1
279    );
280
281    xsum0 += xsum1;
282    xxsum0 += xxsum1;
283
284    *sse = xxsum0;
285    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
286
287
288}
289
/*
 * Sub-pixel 16x16 sum of squared error.
 *
 * Delegates to vp8_sub_pixel_variance16x16_mmx, discards the variance
 * it returns, and returns the value that call stored in *sse.
 */
unsigned int vp8_sub_pixel_mse16x16_mmx(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    (void)vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
303
304unsigned int vp8_sub_pixel_variance16x8_mmx
305(
306    const unsigned char  *src_ptr,
307    int  src_pixels_per_line,
308    int  xoffset,
309    int  yoffset,
310    const unsigned char *dst_ptr,
311    int dst_pixels_per_line,
312    unsigned int *sse
313)
314{
315    int xsum0, xsum1;
316    unsigned int xxsum0, xxsum1;
317
318
319    vp8_filter_block2d_bil_var_mmx(
320        src_ptr, src_pixels_per_line,
321        dst_ptr, dst_pixels_per_line, 8,
322        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
323        &xsum0, &xxsum0
324    );
325
326
327    vp8_filter_block2d_bil_var_mmx(
328        src_ptr + 8, src_pixels_per_line,
329        dst_ptr + 8, dst_pixels_per_line, 8,
330        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
331        &xsum1, &xxsum1
332    );
333
334    xsum0 += xsum1;
335    xxsum0 += xxsum1;
336
337    *sse = xxsum0;
338    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
339}
340
341unsigned int vp8_sub_pixel_variance8x16_mmx
342(
343    const unsigned char  *src_ptr,
344    int  src_pixels_per_line,
345    int  xoffset,
346    int  yoffset,
347    const unsigned char *dst_ptr,
348    int dst_pixels_per_line,
349    unsigned int *sse
350)
351{
352    int xsum;
353    unsigned int xxsum;
354    vp8_filter_block2d_bil_var_mmx(
355        src_ptr, src_pixels_per_line,
356        dst_ptr, dst_pixels_per_line, 16,
357        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
358        &xsum, &xxsum
359    );
360    *sse = xxsum;
361    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
362}
363
364
/*
 * 16x16 variance with a fixed horizontal offset: forwards to
 * vp8_sub_pixel_variance16x16_mmx with xoffset = 4 and yoffset = 0.
 * NOTE(review): index 4 is presumably the half-pixel row of the filter
 * table -- confirm against vp8_bilinear_filters_x86_4.
 */
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_off = 4;
    const int y_off = 0;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
375
376
/*
 * 16x16 variance with a fixed vertical offset: forwards to
 * vp8_sub_pixel_variance16x16_mmx with xoffset = 0 and yoffset = 4.
 * NOTE(review): index 4 is presumably the half-pixel row of the filter
 * table -- confirm against vp8_bilinear_filters_x86_4.
 */
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_off = 0;
    const int y_off = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
387
388
/*
 * 16x16 variance with fixed horizontal and vertical offsets: forwards
 * to vp8_sub_pixel_variance16x16_mmx with xoffset = 4 and yoffset = 4.
 * NOTE(review): index 4 is presumably the half-pixel row of the filter
 * table -- confirm against vp8_bilinear_filters_x86_4.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    const int x_off = 4;
    const int y_off = 4;

    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride,
                                           x_off, y_off,
                                           ref_ptr, recon_stride, sse);
}
399