1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "./vpx_config.h"
12
13#include "vp9/encoder/vp9_variance.h"
14#include "vpx_ports/mem.h"
15
/* Pointer type for the fixed-size SIMD kernels below.  Each kernel computes,
 * for one block_size x block_size tile, the sum of squared differences
 * (*sse) and the signed sum of differences (*sum) between src and ref. */
typedef unsigned int (*variance_fn_t) (const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       unsigned int *sse, int *sum);

/* 4x4 kernel, defined outside this file (MMX flavor per its suffix). */
unsigned int vp9_get4x4var_mmx(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);


/* 8x8 kernel, defined outside this file (SSE2 flavor per its suffix). */
unsigned int vp9_get8x8var_sse2(const unsigned char *src, int src_stride,
                                const unsigned char *ref, int ref_stride,
                                unsigned int *sse, int *sum);

/* 16x16 kernel, defined outside this file (SSE2 flavor per its suffix). */
unsigned int vp9_get16x16var_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse, int *sum);
32
33static void variance_sse2(const unsigned char *src, int src_stride,
34                          const unsigned char *ref, int ref_stride,
35                          int w, int h, unsigned int *sse, int *sum,
36                          variance_fn_t var_fn, int block_size) {
37  int i, j;
38
39  *sse = 0;
40  *sum = 0;
41
42  for (i = 0; i < h; i += block_size) {
43    for (j = 0; j < w; j += block_size) {
44      unsigned int sse0;
45      int sum0;
46      var_fn(src + src_stride * i + j, src_stride,
47             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
48      *sse += sse0;
49      *sum += sum0;
50    }
51  }
52}
53
54unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
55                                  const unsigned char *ref, int ref_stride,
56                                  unsigned int *sse) {
57  int sum;
58  variance_sse2(src, src_stride, ref, ref_stride, 4, 4,
59                sse, &sum, vp9_get4x4var_mmx, 4);
60  return *sse - (((unsigned int)sum * sum) >> 4);
61}
62
63unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
64                                  const uint8_t *ref, int ref_stride,
65                                  unsigned int *sse) {
66  int sum;
67  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
68                sse, &sum, vp9_get4x4var_mmx, 4);
69  return *sse - (((unsigned int)sum * sum) >> 5);
70}
71
72unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
73                                  const uint8_t *ref, int ref_stride,
74                                  unsigned int *sse) {
75  int sum;
76  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
77                sse, &sum, vp9_get4x4var_mmx, 4);
78  return *sse - (((unsigned int)sum * sum) >> 5);
79}
80
81unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
82                                  const unsigned char *ref, int ref_stride,
83                                  unsigned int *sse) {
84  int sum;
85  variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
86                sse, &sum, vp9_get8x8var_sse2, 8);
87  return *sse - (((unsigned int)sum * sum) >> 6);
88}
89
90unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
91                                   const unsigned char *ref, int ref_stride,
92                                   unsigned int *sse) {
93  int sum;
94  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
95                sse, &sum, vp9_get8x8var_sse2, 8);
96  return *sse - (((unsigned int)sum * sum) >> 7);
97}
98
99unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
100                                   const unsigned char *ref, int ref_stride,
101                                   unsigned int *sse) {
102  int sum;
103  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
104                sse, &sum, vp9_get8x8var_sse2, 8);
105  return *sse - (((unsigned int)sum * sum) >> 7);
106}
107
108unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
109                                    const unsigned char *ref, int ref_stride,
110                                    unsigned int *sse) {
111  int sum;
112  variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
113                sse, &sum, vp9_get16x16var_sse2, 16);
114  return *sse - (((unsigned int)sum * sum) >> 8);
115}
116
/* Mean-squared-error of a 16x16 block: the sum of squared differences
 * only.  The kernel also produces the sum of differences, which MSE does
 * not use, so it is discarded.
 * Pointer parameters use uint8_t for consistency with the other wrappers
 * in this file; uint8_t is unsigned char, so the ABI and all callers are
 * unaffected. */
unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;  /* required out-param; intentionally unused */
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}
124
125unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
126                                    const uint8_t *ref, int ref_stride,
127                                    unsigned int *sse) {
128  int sum;
129  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
130                sse, &sum, vp9_get16x16var_sse2, 16);
131  return *sse - (((int64_t)sum * sum) >> 10);
132}
133
134unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
135                                    const uint8_t *ref, int ref_stride,
136                                    unsigned int *sse) {
137  int sum;
138  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
139                sse, &sum, vp9_get16x16var_sse2, 16);
140  return *sse - (((int64_t)sum * sum) >> 9);
141}
142
143unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
144                                    const uint8_t *ref, int ref_stride,
145                                    unsigned int *sse) {
146  int sum;
147  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
148                sse, &sum, vp9_get16x16var_sse2, 16);
149  return *sse - (((int64_t)sum * sum) >> 9);
150}
151
152unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
153                                    const uint8_t *ref, int ref_stride,
154                                    unsigned int *sse) {
155  int sum;
156  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
157                sse, &sum, vp9_get16x16var_sse2, 16);
158  return *sse - (((int64_t)sum * sum) >> 12);
159}
160
161unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
162                                    const uint8_t *ref, int ref_stride,
163                                    unsigned int *sse) {
164  int sum;
165  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
166                sse, &sum, vp9_get16x16var_sse2, 16);
167  return *sse - (((int64_t)sum * sum) >> 11);
168}
169
170unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
171                                    const uint8_t *ref, int ref_stride,
172                                    unsigned int *sse) {
173  int sum;
174  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
175                sse, &sum, vp9_get16x16var_sse2, 16);
176  return *sse - (((int64_t)sum * sum) >> 11);
177}
178
/* Prototype for the external sub-pixel variance kernels.  Each kernel
 * handles a fixed width w (4, 8 or 16 columns) and a run-time height at
 * the sub-pixel position selected by (x_offset, y_offset).  It returns
 * the signed sum of differences and stores the SSE through *sse. */
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
                                        ptrdiff_t src_stride, \
                                        int x_offset, int y_offset, \
                                        const uint8_t *dst, \
                                        ptrdiff_t dst_stride, \
                                        int height, unsigned int *sse)
/* The 4-wide kernel comes in a lower-SIMD flavor (opt2); the 8- and
 * 16-wide kernels use opt1. */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
195
/* Builds vp9_sub_pixel_variance<w>x<h>_<opt>() on top of the fixed-width
 * kernels declared above.  The wf-wide kernel covers one column strip;
 * for w > wf the remaining strips are handled by extra kernel calls at
 * byte offsets +16, +32 and +48.
 * NOTE(review): those constant offsets assume wf == 16 whenever w > wf,
 * which holds for every instantiation in FNS below.
 * Result: variance = sse - sum^2 / (w*h); 'cast' widens se*se where the
 * squared sum of differences may exceed 32 bits. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
                                                     int src_stride, \
                                                     int x_offset, \
                                                     int y_offset, \
                                                     const uint8_t *dst, \
                                                     int dst_stride, \
                                                     unsigned int *sse_ptr) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                y_offset, dst, dst_stride, \
                                                h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
                                                   x_offset, y_offset, \
                                                   dst + 16, dst_stride, \
                                                   h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 32, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                 x_offset, y_offset, \
                                                 dst + 48, dst_stride, \
                                                 h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
234
/* Instantiate every supported block size.  FN arguments: width, height,
 * kernel width, log2(width), log2(height), SIMD suffix, and the cast
 * applied to se*se -- int64_t wherever the squared sum of differences
 * can overflow 32 bits (blocks larger than 16x16). */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
255
/* Prototype for the external sub-pixel compound-average variance kernels.
 * Like the plain sub-pixel kernels above, but the filtered prediction is
 * combined with the second prediction 'sec' (row stride sec_stride)
 * before differencing against dst. */
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
                                            ptrdiff_t src_stride, \
                                            int x_offset, int y_offset, \
                                            const uint8_t *dst, \
                                            ptrdiff_t dst_stride, \
                                            const uint8_t *sec, \
                                            ptrdiff_t sec_stride, \
                                            int height, unsigned int *sse)
/* The 4-wide kernel comes in a lower-SIMD flavor (opt2); the 8- and
 * 16-wide kernels use opt1. */
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
DECL(16, opt1)

DECLS(sse2, sse);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS
274
/* Builds vp9_sub_pixel_avg_variance<w>x<h>_<opt>().  Same column-strip
 * tiling as the FN macro earlier in this file: the wf-wide kernel is
 * called once per strip at byte offsets +16/+32/+48 (which assumes
 * wf == 16 whenever w > wf -- true for all instantiations below).
 * 'sec' is the second-prediction buffer; the kernel is passed w as its
 * row stride, so sec is expected to be a contiguous w x h block. */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
                                                         int src_stride, \
                                                         int x_offset, \
                                                         int y_offset, \
                                                         const uint8_t *dst, \
                                                         int dst_stride, \
                                                         unsigned int *sseptr, \
                                                         const uint8_t *sec) { \
  unsigned int sse; \
  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                    y_offset, dst, dst_stride, \
                                                    sec, w, h, &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst + 16, dst_stride, \
                                                       sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 32, dst_stride, \
                                                     sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
                                                     x_offset, y_offset, \
                                                     dst + 48, dst_stride, \
                                                     sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sseptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
314
/* Instantiate every supported block size for the compound-average
 * variant.  The last FN argument is the cast applied to se*se --
 * int64_t wherever the squared sum of differences can overflow 32 bits
 * (blocks larger than 16x16). */
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
FN(4,   4,  4, 2, 2, opt2, (unsigned int))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN
335