/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"         // uint16_t, uint32_t, int64_t
#include "vpx_dsp/vpx_dsp_common.h"  // ROUND_POWER_OF_TWO
#include "vpx_ports/mem.h"           // CONVERT_TO_SHORTPTR

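// Per-block kernel signature: each call accumulates the sum of pixel
// differences (*sum) and the sum of squared differences (*sse) for a single
// block. The calc8x8/calc16x16 kernels declared below are implemented
// separately in SSE2.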
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

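// Accumulates variance statistics for a w x h region of 8-bit-depth samples
// (stored in 16-bit buffers) by tiling it into block_size x block_size
// blocks and summing the per-block results from var_fn.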
static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride, int w,
                                   int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

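// 10-bit variant: sample values span 4x the 8-bit range, so the accumulated
// sum is scaled back by 1/4 (>> 2) and the SSE by 1/16 (>> 4); the SSE is
// accumulated in 64 bits to avoid overflow before the rescale.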
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}

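// 12-bit variant: sample values span 16x the 8-bit range, so the sum is
// scaled back by 1/16 (>> 4) and the SSE by 1/256 (>> 8).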
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}

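// Emits the get{S}x{S}var entry points for 8-, 10- and 12-bit depths. Each
// wrapper converts the uint8_t interface pointers to the underlying uint16_t
// buffers and, for 10/12-bit, rescales sum and SSE as described above.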
#define HIGH_GET_VAR(S)                                                       \
  void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                         const uint8_t *ref8, int ref_stride, \
                                         uint32_t *sse, int *sum) {           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
  }                                                                           \
                                                                              \
  void vpx_highbd_10_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
  }                                                                           \
                                                                              \
  void vpx_highbd_12_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
  }

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

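// Emits full-block variance entry points: variance = SSE - sum^2 / (w * h),
// where `shift` is log2(w * h). The 10- and 12-bit variants clamp to zero in
// case rounding of the rescaled sum/SSE pushes the result negative.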
#define VAR_FN(w, h, block_size, shift)                                    \
  uint32_t vpx_highbd_8_variance##w##x##h##_sse2(                          \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_8_variance_sse2(                                                \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
  }                                                                        \
                                                                           \
  uint32_t vpx_highbd_10_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_10_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  uint32_t vpx_highbd_12_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_12_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN

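// MSE is the SSE without mean removal, so these wrappers run the matching
// variance helper and return *sse directly (the computed sum is unused).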
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                         vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                         vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                          vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                          vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

// The two unused parameters are placeholders for the PIC-enabled build.
// These declarations are for functions defined in
// highbd_subpel_variance_impl_sse2.asm.
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(sse2);

#undef DECLS
#undef DECL

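// Emits sub-pixel variance entry points on top of the wf-wide assembly
// kernels (wf is 8 or 16). Blocks wider than wf are handled as extra
// 16-pixel column strips at offsets +16, +32 and +48; wlog2/hlog2 are
// log2 of the block dimensions for the sum^2 / (w * h) term. The 12-bit
// version also walks the block in bands of at most 16 rows, accumulating
// SSE in 64 bits to keep the intermediates from overflowing 32 bits.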
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define FNS(opt)                        \
  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
  FN(8, 4, 8, 3, 2, opt, (int64_t));

FNS(sse2);

#undef FNS
#undef FN

// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt)                                                         \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
      void *unused);
#define DECLS(opt1) \
  DECL(16, opt1)    \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS

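// Same structure as the sub-pixel variance macro above, but the kernels also
// take a second predictor `sec` (laid out with stride w) that is combined
// with the sub-pixel prediction before the differences are computed, so the
// column strips advance src, dst and sec together.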
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int start_row;                                                             \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
          w, height, &sse2, NULL, NULL);                                       \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#define FNS(opt1)                        \
  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
  FN(8, 16, 8, 3, 4, opt1, (int64_t));   \
  FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
  FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2);

#undef FNS
#undef FN