1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_config.h"
12
13#include "vp9/encoder/vp9_variance.h"
14#include "vp9/common/vp9_pragmas.h"
15#include "vpx_ports/mem.h"
16
/*
 * Fixed-size block kernels. Their definitions are not part of this file.
 *
 * Each one takes two pixel buffers with their strides and reports its results
 * through the SSE and Sum output pointers. The callers visible in this file
 * use only those two outputs and ignore the unsigned int return value.
 *
 * NOTE(review): SSE and Sum are presumably the sum of squared differences and
 * the sum of differences between src and ref over the block -- confirm
 * against the kernel implementations.
 */
extern unsigned int vp9_get4x4var_mmx(const unsigned char *src_ptr,
                                      int  source_stride,
                                      const unsigned char *ref_ptr,
                                      int  recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);

unsigned int vp9_get16x16var_sse2(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *SSE,
                                  int *Sum);

unsigned int vp9_get8x8var_sse2(const unsigned char *src_ptr,
                                int source_stride,
                                const unsigned char *ref_ptr,
                                int recon_stride,
                                unsigned int *SSE,
                                int *Sum);
/*
 * Half-pixel variance kernels. Their definitions are not part of this file.
 *
 * All six share one signature: two pixel buffers with their line strides, a
 * row count (Height), and two outputs, sum and sumsquared. The "8x"/"16x"
 * part of each name is presumably the fixed block width handled per call --
 * TODO confirm against the kernel implementations.
 *
 * Note that the first buffer is named ref_ptr here, while the wrappers in
 * this file pass their own src_ptr in that position.
 *
 * Only the three 16x variants are called in the visible part of this file.
 */
void vp9_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                           int ref_pixels_per_line,
                                           const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum,
                                           unsigned int *sumsquared);

void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                            int ref_pixels_per_line,
                                            const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum,
                                            unsigned int *sumsquared);

void vp9_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);

void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
                                       int ref_pixels_per_line,
                                       const unsigned char *src_ptr,
                                       int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum,
                                       unsigned int *sumsquared);

void vp9_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr,
                                     int ref_pixels_per_line,
                                     const unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum,
                                     unsigned int *sumsquared);

void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
                                      int ref_pixels_per_line,
                                      const unsigned char *src_ptr,
                                      int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum,
                                      unsigned int *sumsquared);
105
/*
 * Signature shared by the fixed-size block kernels that variance_sse2 tiles
 * over: two pixel buffers with their strides in, SSE and Sum out.
 */
typedef unsigned int (*get_var_sse2)(const unsigned char *src_ptr,
                                     int source_stride,
                                     const unsigned char *ref_ptr,
                                     int recon_stride,
                                     unsigned int *SSE,
                                     int *Sum);

/*
 * Covers a w x h region with square tiles of side block_size, calls var_fn
 * once per tile, and totals the per-tile outputs.
 *
 *   src_ptr, source_stride  first buffer and its row stride
 *   ref_ptr, recon_stride   second buffer and its row stride
 *   w, h                    region size; tiles start at multiples of
 *                           block_size below w and h
 *   sse, sum                outputs; always reset to 0 first, then receive
 *                           the totals of var_fn's SSE and Sum outputs
 *   var_fn                  kernel invoked for each tile (return value is
 *                           ignored)
 *   block_size              tile side length, used as the loop step
 *
 * If block_size is not positive the outputs are left at 0 and var_fn is never
 * called.
 */
static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
                          const unsigned char *ref_ptr, int  recon_stride,
                          int  w, int  h, unsigned int *sse, int *sum,
                          get_var_sse2 var_fn, int block_size) {
  unsigned int tile_sse;
  int tile_sum;
  int row, col;

  *sse = 0;
  *sum = 0;

  /* A non-positive step would never advance the loops below. */
  if (block_size <= 0) {
    return;
  }

  for (row = 0; row < h; row += block_size) {
    for (col = 0; col < w; col += block_size) {
      var_fn(src_ptr + source_stride * row + col, source_stride,
             ref_ptr + recon_stride * row + col, recon_stride,
             &tile_sse, &tile_sum);
      *sse += tile_sse;
      *sum += tile_sum;
    }
  }
}
135
136unsigned int vp9_variance4x4_sse2(
137  const unsigned char *src_ptr,
138  int  source_stride,
139  const unsigned char *ref_ptr,
140  int  recon_stride,
141  unsigned int *sse) {
142  unsigned int var;
143  int avg;
144
145  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
146                  &var, &avg, vp9_get4x4var_mmx, 4);
147  *sse = var;
148  return (var - (((unsigned int)avg * avg) >> 4));
149}
150
151unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
152                                  int  source_stride,
153                                  const uint8_t *ref_ptr,
154                                  int  recon_stride,
155                                  unsigned int *sse) {
156  unsigned int var;
157  int avg;
158
159  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
160                  &var, &avg, vp9_get4x4var_mmx, 4);
161  *sse = var;
162  return (var - (((unsigned int)avg * avg) >> 5));
163}
164
165unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
166                                  int  source_stride,
167                                  const uint8_t *ref_ptr,
168                                  int  recon_stride,
169                                  unsigned int *sse) {
170  unsigned int var;
171  int avg;
172
173  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
174                  &var, &avg, vp9_get4x4var_mmx, 4);
175  *sse = var;
176  return (var - (((unsigned int)avg * avg) >> 5));
177}
178
179unsigned int vp9_variance8x8_sse2
180(
181  const unsigned char *src_ptr,
182  int  source_stride,
183  const unsigned char *ref_ptr,
184  int  recon_stride,
185  unsigned int *sse) {
186  unsigned int var;
187  int avg;
188
189  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
190                  &var, &avg, vp9_get8x8var_sse2, 8);
191  *sse = var;
192  return (var - (((unsigned int)avg * avg) >> 6));
193}
194
195unsigned int vp9_variance16x8_sse2
196(
197  const unsigned char *src_ptr,
198  int  source_stride,
199  const unsigned char *ref_ptr,
200  int  recon_stride,
201  unsigned int *sse) {
202  unsigned int var;
203  int avg;
204
205  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
206                  &var, &avg, vp9_get8x8var_sse2, 8);
207  *sse = var;
208  return (var - (((unsigned int)avg * avg) >> 7));
209}
210
211unsigned int vp9_variance8x16_sse2
212(
213  const unsigned char *src_ptr,
214  int  source_stride,
215  const unsigned char *ref_ptr,
216  int  recon_stride,
217  unsigned int *sse) {
218  unsigned int var;
219  int avg;
220
221  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
222                &var, &avg, vp9_get8x8var_sse2, 8);
223  *sse = var;
224  return (var - (((unsigned int)avg * avg) >> 7));
225}
226
227unsigned int vp9_variance16x16_sse2
228(
229  const unsigned char *src_ptr,
230  int  source_stride,
231  const unsigned char *ref_ptr,
232  int  recon_stride,
233  unsigned int *sse) {
234  unsigned int var;
235  int avg;
236
237  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
238                &var, &avg, vp9_get16x16var_sse2, 16);
239  *sse = var;
240  return (var - (((unsigned int)avg * avg) >> 8));
241}
242
/*
 * SSE of a 16x16 block.
 *
 * Calls vp9_get16x16var_sse2 once, stores the kernel's SSE output in *sse and
 * returns that same value. The kernel's Sum output is discarded.
 */
unsigned int vp9_mse16x16_sse2(const unsigned char *src_ptr,
                               int  source_stride,
                               const unsigned char *ref_ptr,
                               int  recon_stride,
                               unsigned int *sse) {
  unsigned int block_sse;
  int block_sum;  /* the kernel requires a Sum output; its value is unused */

  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &block_sse, &block_sum);

  *sse = block_sse;
  return *sse;
}
256
257unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
258                                    int  source_stride,
259                                    const uint8_t *ref_ptr,
260                                    int  recon_stride,
261                                    unsigned int *sse) {
262  unsigned int var;
263  int avg;
264
265  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
266                &var, &avg, vp9_get16x16var_sse2, 16);
267  *sse = var;
268  return (var - (((int64_t)avg * avg) >> 10));
269}
270
271unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
272                                    int  source_stride,
273                                    const uint8_t *ref_ptr,
274                                    int  recon_stride,
275                                    unsigned int *sse) {
276  unsigned int var;
277  int avg;
278
279  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
280                &var, &avg, vp9_get16x16var_sse2, 16);
281  *sse = var;
282  return (var - (((int64_t)avg * avg) >> 9));
283}
284
285unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
286                                    int  source_stride,
287                                    const uint8_t *ref_ptr,
288                                    int  recon_stride,
289                                    unsigned int *sse) {
290  unsigned int var;
291  int avg;
292
293  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
294                &var, &avg, vp9_get16x16var_sse2, 16);
295  *sse = var;
296  return (var - (((int64_t)avg * avg) >> 9));
297}
298
299unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
300                                    int  source_stride,
301                                    const uint8_t *ref_ptr,
302                                    int  recon_stride,
303                                    unsigned int *sse) {
304  unsigned int var;
305  int avg;
306
307  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
308                &var, &avg, vp9_get16x16var_sse2, 16);
309  *sse = var;
310  return (var - (((int64_t)avg * avg) >> 12));
311}
312
313unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
314                                    int  source_stride,
315                                    const uint8_t *ref_ptr,
316                                    int  recon_stride,
317                                    unsigned int *sse) {
318  unsigned int var;
319  int avg;
320
321  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
322                &var, &avg, vp9_get16x16var_sse2, 16);
323  *sse = var;
324  return (var - (((int64_t)avg * avg) >> 11));
325}
326
327unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
328                                    int  source_stride,
329                                    const uint8_t *ref_ptr,
330                                    int  recon_stride,
331                                    unsigned int *sse) {
332  unsigned int var;
333  int avg;
334
335  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
336                &var, &avg, vp9_get16x16var_sse2, 16);
337  *sse = var;
338  return (var - (((int64_t)avg * avg) >> 11));
339}
340
/*
 * Prototypes for the fixed-width, variable-height sub-pixel variance kernels.
 * Their definitions are not part of this file.
 *
 * DECL(w, opt) declares vp9_sub_pixel_variance<w>xh_<opt>(). Each kernel
 * takes a source and a destination buffer with their strides, an x/y offset
 * pair, and a row count, returns an int, and writes an unsigned int through
 * sse. The 4-wide kernel uses the "sse" suffix in the SSE2 family and "ssse3"
 * in the SSSE3 family; the 8- and 16-wide kernels use "sse2" / "ssse3".
 *
 * NOTE(review): the units of x_offset / y_offset are not visible from here --
 * confirm against the kernel implementations.
 */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)

/* SSE2 family. */
DECL(4, sse);
DECL(8, sse2);
DECL(16, sse2);

/* SSSE3 family. */
DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
357
/*
 * FN generates vp9_sub_pixel_variance<w>x<h>_<opt>().
 *
 *   w, h          block width and height
 *   wf            width handled by one kernel call; selects the
 *                 vp9_sub_pixel_variance<wf>xh_<opt> kernel
 *   wlog2, hlog2  log2 of w and h; their sum is the shift applied to the
 *                 squared sum
 *   opt           instruction-set suffix of both the wrapper and the kernel
 *   cast          type used to form the squared sum, including parentheses
 *
 * The block is processed as vertical strips of width wf, left to right. The
 * int returned by each kernel call is totalled in se and the value written
 * through its last argument is totalled in sse. The sse total is stored in
 * *sse_ptr and the wrapper returns sse - ((se * se) >> (wlog2 + hlog2)).
 *
 * FN expands to a complete function definition, so neither the FN lines in
 * FNS nor the FNS invocations end in ';' -- a stray ';' at file scope is not
 * valid ISO C.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse = 0; \
  int se = 0; \
  int col; \
  for (col = 0; col < (w); col += (wf)) { \
    unsigned int strip_sse; \
    const int strip_se = \
        vp9_sub_pixel_variance##wf##xh_##opt(src + col, src_stride, \
                                             x_offset, y_offset, \
                                             dst + col, dst_stride, \
                                             (h), &strip_sse); \
    se += strip_se; \
    sse += strip_sse; \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
}

/*
 * Instantiates every block size for one instruction-set family. opt1 is the
 * suffix for the 8- and 16-wide kernels, opt2 for the 4-wide kernel. Sizes
 * whose squared sum may not fit in 32 bits use an int64_t product.
 */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)) \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)) \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)) \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)) \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)) \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)) \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse)
FNS(ssse3, ssse3)

#undef FNS
#undef FN
417
/*
 * Prototypes for the fixed-width, variable-height sub-pixel "avg" variance
 * kernels. Their definitions are not part of this file.
 *
 * DECL(w, opt) declares vp9_sub_pixel_avg_variance<w>xh_<opt>(). Compared
 * with the plain sub-pixel kernels these take one more buffer, sec, with its
 * own stride. The 4-wide kernel uses the "sse" suffix in the SSE2 family and
 * "ssse3" in the SSSE3 family; the 8- and 16-wide kernels use "sse2" /
 * "ssse3".
 *
 * NOTE(review): sec is presumably a second predictor that is averaged in
 * before the variance is taken -- confirm against the kernel implementations.
 */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)

/* SSE2 family. */
DECL(4, sse);
DECL(8, sse2);
DECL(16, sse2);

/* SSSE3 family. */
DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
436
/*
 * FN generates vp9_sub_pixel_avg_variance<w>x<h>_<opt>().
 *
 *   w, h          block width and height
 *   wf            width handled by one kernel call; selects the
 *                 vp9_sub_pixel_avg_variance<wf>xh_<opt> kernel
 *   wlog2, hlog2  log2 of w and h; their sum is the shift applied to the
 *                 squared sum
 *   opt           instruction-set suffix of both the wrapper and the kernel
 *   cast          type used to form the squared sum, including parentheses
 *
 * The block is processed as vertical strips of width wf, left to right; src,
 * dst and sec all advance by the same column offset. The sec buffer is passed
 * to the kernel with a stride of w. The int returned by each kernel call is
 * totalled in se and the value written through its last argument is totalled
 * in sse. The sse total is stored in *sseptr and the wrapper returns
 * sse - ((se * se) >> (wlog2 + hlog2)).
 *
 * FN expands to a complete function definition, so neither the FN lines in
 * FNS nor the FNS invocations end in ';' -- a stray ';' at file scope is not
 * valid ISO C.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse = 0; \
  int se = 0; \
  int col; \
  for (col = 0; col < (w); col += (wf)) { \
    unsigned int strip_sse; \
    const int strip_se = \
        vp9_sub_pixel_avg_variance##wf##xh_##opt(src + col, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + col, dst_stride, \
                                                 sec + col, (w), (h), \
                                                 &strip_sse); \
    se += strip_se; \
    sse += strip_sse; \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> ((wlog2) + (hlog2))); \
}

/*
 * Instantiates every block size for one instruction-set family. opt1 is the
 * suffix for the 8- and 16-wide kernels, opt2 for the 4-wide kernel. Sizes
 * whose squared sum may not fit in 32 bits use an int64_t product.
 */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)) \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)) \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)) \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)) \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)) \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)) \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse)
FNS(ssse3, ssse3)

#undef FNS
#undef FN
497
/*
 * 16x16 variance through the horizontal half-pixel kernel.
 *
 * Calls vp9_half_horiz_variance16x_h_sse2 with a Height of 16, stores the
 * kernel's sumsquared output in *sse, and returns
 * sumsquared - ((sum * sum) >> 8), where 1 << 8 == 16 * 16.
 *
 * NOTE(review): judging by the kernel's name, src is sampled at a horizontal
 * half-pixel position -- the kernel itself is not visible here.
 */
unsigned int vp9_variance_halfpixvar16x16_h_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int mean_term;

  vp9_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pixels_per_line,
                                    16, &sum, &sum_squared);

  *sse = sum_squared;
  mean_term = ((unsigned int)sum * sum) >> 8;
  return sum_squared - mean_term;
}
515
516
/*
 * 16x16 variance through the vertical half-pixel kernel.
 *
 * Calls vp9_half_vert_variance16x_h_sse2 with a Height of 16, stores the
 * kernel's sumsquared output in *sse, and returns
 * sumsquared - ((sum * sum) >> 8), where 1 << 8 == 16 * 16.
 *
 * NOTE(review): judging by the kernel's name, src is sampled at a vertical
 * half-pixel position -- the kernel itself is not visible here.
 */
unsigned int vp9_variance_halfpixvar16x16_v_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int mean_term;

  vp9_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                   dst_ptr, dst_pixels_per_line,
                                   16, &sum, &sum_squared);

  *sse = sum_squared;
  mean_term = ((unsigned int)sum * sum) >> 8;
  return sum_squared - mean_term;
}
533
534
/*
 * 16x16 variance through the combined horizontal + vertical half-pixel
 * kernel.
 *
 * Calls vp9_half_horiz_vert_variance16x_h_sse2 with a Height of 16, stores
 * the kernel's sumsquared output in *sse, and returns
 * sumsquared - ((sum * sum) >> 8), where 1 << 8 == 16 * 16.
 *
 * NOTE(review): judging by the kernel's name, src is sampled at a half-pixel
 * position in both directions -- the kernel itself is not visible here.
 */
unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
  const unsigned char *src_ptr,
  int  src_pixels_per_line,
  const unsigned char *dst_ptr,
  int  dst_pixels_per_line,
  unsigned int *sse) {
  int sum;
  unsigned int sum_squared;
  unsigned int mean_term;

  vp9_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line,
                                         16, &sum, &sum_squared);

  *sse = sum_squared;
  mean_term = ((unsigned int)sum * sum) >> 8;
  return sum_squared - mean_term;
}
552