highbd_variance_sse2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10#include "./vpx_config.h"
11
12#include "vpx_ports/mem.h"
13
// Signature shared by the fixed-size block kernels declared in this file.
// A kernel receives a source and a reference block of 16-bit samples with
// their strides and reports its results through the sse and sum pointers; the
// callers in this file do not use the return value.
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);
17
// Block kernels that are defined outside this file (presumably the SSE2
// assembly implementations -- confirm).  Each one handles a single 8x8 or
// 16x16 block of 16-bit samples, writes its results to *sse and *sum, and
// matches high_variance_fn_t so it can be handed to the tiling helpers in this
// file.
uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
25
26static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
27                                   const uint16_t *ref, int ref_stride,
28                                   int w, int h, uint32_t *sse, int *sum,
29                                   high_variance_fn_t var_fn, int block_size) {
30  int i, j;
31
32  *sse = 0;
33  *sum = 0;
34
35  for (i = 0; i < h; i += block_size) {
36    for (j = 0; j < w; j += block_size) {
37      unsigned int sse0;
38      int sum0;
39      var_fn(src + src_stride * i + j, src_stride,
40             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
41      *sse += sse0;
42      *sum += sum0;
43    }
44  }
45}
46
47static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
48                                    const uint16_t *ref, int ref_stride,
49                                    int w, int h, uint32_t *sse, int *sum,
50                                    high_variance_fn_t var_fn, int block_size) {
51  int i, j;
52  uint64_t sse_long = 0;
53  int32_t sum_long = 0;
54
55  for (i = 0; i < h; i += block_size) {
56    for (j = 0; j < w; j += block_size) {
57      unsigned int sse0;
58      int sum0;
59      var_fn(src + src_stride * i + j, src_stride,
60             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
61      sse_long += sse0;
62      sum_long += sum0;
63    }
64  }
65  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
66  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
67}
68
69static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
70                                    const uint16_t *ref, int ref_stride,
71                                    int w, int h, uint32_t *sse, int *sum,
72                                    high_variance_fn_t var_fn, int block_size) {
73  int i, j;
74  uint64_t sse_long = 0;
75  int32_t sum_long = 0;
76
77  for (i = 0; i < h; i += block_size) {
78    for (j = 0; j < w; j += block_size) {
79      unsigned int sse0;
80      int sum0;
81      var_fn(src + src_stride * i + j, src_stride,
82             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
83      sse_long += sse0;
84      sum_long += sum0;
85    }
86  }
87  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
88  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
89}
90
91
// Generates the vpx_highbd_getSxSvar_sse2, vpx_highbd_10_getSxSvar_sse2 and
// vpx_highbd_12_getSxSvar_sse2 entry points for an S x S block.  Each one
// converts the uint8_t-typed handles with CONVERT_TO_SHORTPTR and forwards
// them to vpx_highbd_calcSxSvar_sse2, which fills in *sse and *sum.  The 10-
// and 12-bit variants then scale both outputs down in place with
// ROUND_POWER_OF_TWO (sum by 2 resp. 4 bits, sse by 4 resp. 8 bits).
#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
  const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src16, src_stride, ref16, ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
  const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src16, src_stride, ref16, ref_stride, \
                                     sse, sum); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8); \
  const uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src16, src_stride, ref16, ref_stride, \
                                     sse, sum); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR
128
129#define VAR_FN(w, h, block_size, shift) \
130uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
131    const uint8_t *src8, int src_stride, \
132    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
133  int sum; \
134  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
135  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
136  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
137                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
138                         block_size); \
139  return *sse - (((int64_t)sum * sum) >> shift); \
140} \
141\
142uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
143    const uint8_t *src8, int src_stride, \
144    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
145  int sum; \
146  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
147  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
148  highbd_10_variance_sse2( \
149      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
150      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
151  return *sse - (((int64_t)sum * sum) >> shift); \
152} \
153\
154uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
155    const uint8_t *src8, int src_stride, \
156    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
157  int sum; \
158  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
159  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
160  highbd_12_variance_sse2( \
161      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
162      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
163  return *sse - (((int64_t)sum * sum) >> shift); \
164}
165
166VAR_FN(64, 64, 16, 12);
167VAR_FN(64, 32, 16, 11);
168VAR_FN(32, 64, 16, 11);
169VAR_FN(32, 32, 16, 10);
170VAR_FN(32, 16, 16, 9);
171VAR_FN(16, 32, 16, 9);
172VAR_FN(16, 16, 16, 8);
173VAR_FN(16, 8, 8, 7);
174VAR_FN(8, 16, 8, 7);
175VAR_FN(8, 8, 8, 6);
176
177#undef VAR_FN
178
179unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
180                                      const uint8_t *ref8, int ref_stride,
181                                      unsigned int *sse) {
182  int sum;
183  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
184  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
185  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
186                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
187  return *sse;
188}
189
190unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
191                                         const uint8_t *ref8, int ref_stride,
192                                         unsigned int *sse) {
193  int sum;
194  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
195  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
196  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
197                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
198  return *sse;
199}
200
201unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
202                                         const uint8_t *ref8, int ref_stride,
203                                         unsigned int *sse) {
204  int sum;
205  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
206  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
207  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
208                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
209  return *sse;
210}
211
212unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
213                                    const uint8_t *ref8, int ref_stride,
214                                    unsigned int *sse) {
215  int sum;
216  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
217  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
218  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
219                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
220  return *sse;
221}
222
223unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
224                                       const uint8_t *ref8, int ref_stride,
225                                       unsigned int *sse) {
226  int sum;
227  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
228  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
229  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
230                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
231  return *sse;
232}
233
234unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
235                                       const uint8_t *ref8, int ref_stride,
236                                       unsigned int *sse) {
237  int sum;
238  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
239  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
240  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
241                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
242  return *sse;
243}
244
245#if CONFIG_USE_X86INC
// Prototypes of the w-wide, variable-height sub-pixel variance kernels.  They
// are defined outside this file (presumably in x86inc assembly, given the
// CONFIG_USE_X86INC guard -- confirm).  The callers in this file treat the
// return value as the sum of differences and read the squared error back
// through sse.
#define DECL(w, opt) \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt( \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height, \
      unsigned int *sse)
// Declares the 8- and 16-wide kernels for one flavour; opt2 is not used.
#define DECLS(opt1, opt2) \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
// TODO(johannkoenig): enable the ssse3 or delete
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
262
// Generates vpx_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>.
//
//   w, h          block dimensions in samples
//   wf            width handled by one kernel call (8 or 16); the block is
//                 covered in column strips of this width
//   wlog2, hlog2  log2 of w and h; only their sum is used, as the shift that
//                 divides se^2 by the number of samples
//   opt           suffix of the kernel flavour to call
//   cast          cast applied to se before it is squared
//
// Every function converts the uint8_t-typed handles with
// CONVERT_TO_SHORTPTR, accumulates the kernel's sum of differences (se) and
// squared error (sse) over all strips, stores sse in *sse_ptr and returns
// sse - se^2 / (w * h).
//
// Strip offsets are derived from wf rather than written out as 16/32/48; for
// every instantiation below this visits the same strips in the same order.
//
// In the 10- and 12-bit variants se and sse are scaled down separately with
// ROUND_POWER_OF_TWO, so the difference can dip below zero.  It is evaluated
// in int64_t and clamped at 0 so that it cannot wrap around to a value close
// to UINT32_MAX.
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse); \
  int col; \
  /* Remaining column strips, if the block is wider than one kernel call. */ \
  for (col = wf; col < w; col += wf) { \
    unsigned int sse2; \
    const int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, h, &sse2); \
    se += se2; \
    sse += sse2; \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int64_t var; \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse); \
  int col; \
  for (col = wf; col < w; col += wf) { \
    uint32_t sse2; \
    const int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, h, &sse2); \
    se += se2; \
    sse += sse2; \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
  return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int start_row; \
  int col; \
  int64_t var; \
  uint32_t sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  /* The block is fed to the kernel in bands of at most 16 rows and the */ \
  /* squared error is accumulated in 64 bits -- presumably so that neither */ \
  /* a single kernel call nor the running total can overflow 32 bits. */ \
  for (start_row = 0; start_row < h; start_row += 16) { \
    const int height = h - start_row < 16 ? h - start_row : 16; \
    for (col = 0; col < w; col += wf) { \
      uint32_t sse2; \
      const int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src + col + (start_row * src_stride), src_stride, \
          x_offset, y_offset, \
          dst + col + (start_row * dst_stride), dst_stride, \
          height, &sse2); \
      se += se2; \
      long_sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
  return (var >= 0) ? (uint32_t)var : 0; \
}

// Instantiates FN for every supported block size; opt2 is not used.
#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));


FNS(sse2, sse);

#undef FNS
#undef FN
412
// Prototypes of the w-wide, variable-height sub-pixel averaging variance
// kernels.  They are defined outside this file (presumably in x86inc assembly,
// given the CONFIG_USE_X86INC guard -- confirm).  Besides src and dst they
// take a second predictor, sec, with its own stride.  The callers in this
// file treat the return value as the sum of differences and read the squared
// error back through sse.
#define DECL(w, opt) \
  int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, \
      const uint16_t *sec, ptrdiff_t sec_stride, \
      int height, unsigned int *sse)
// Declares the 16- and 8-wide kernels for one flavour.
#define DECLS(opt1) \
  DECL(16, opt1); \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
430
// Generates vpx_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt>.
//
//   w, h          block dimensions in samples
//   wf            width handled by one kernel call (8 or 16); the block is
//                 covered in column strips of this width
//   wlog2, hlog2  log2 of w and h; only their sum is used, as the shift that
//                 divides se^2 by the number of samples
//   opt           suffix of the kernel flavour to call
//   cast          cast applied to se before it is squared
//
// Every function converts the uint8_t-typed handles (src8, dst8 and the
// second predictor sec8) with CONVERT_TO_SHORTPTR, accumulates the kernel's
// sum of differences (se) and squared error (sse) over all strips, stores
// sse in *sse_ptr and returns sse - se^2 / (w * h).  The second predictor is
// addressed with a row stride of w.
//
// Strip offsets are derived from wf rather than written out as 16/32/48; for
// every instantiation below this visits the same strips in the same order.
//
// In the 10- and 12-bit variants se and sse are scaled down separately with
// ROUND_POWER_OF_TWO, so the difference can dip below zero.  It is evaluated
// in int64_t and clamped at 0 so that it cannot wrap around to a value close
// to UINT32_MAX.
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, \
      dst, dst_stride, sec, w, h, &sse); \
  int col; \
  /* Remaining column strips, if the block is wider than one kernel call. */ \
  for (col = wf; col < w; col += wf) { \
    uint32_t sse2; \
    const int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, sec + col, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int64_t var; \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, \
      dst, dst_stride, sec, w, h, &sse); \
  int col; \
  for (col = wf; col < w; col += wf) { \
    uint32_t sse2; \
    const int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + col, src_stride, x_offset, y_offset, \
        dst + col, dst_stride, sec + col, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
  return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int start_row; \
  int col; \
  int64_t var; \
  uint32_t sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  /* The block is fed to the kernel in bands of at most 16 rows and the */ \
  /* squared error is accumulated in 64 bits -- presumably so that neither */ \
  /* a single kernel call nor the running total can overflow 32 bits. */ \
  for (start_row = 0; start_row < h; start_row += 16) { \
    const int height = h - start_row < 16 ? h - start_row : 16; \
    for (col = 0; col < w; col += wf) { \
      uint32_t sse2; \
      const int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + col + (start_row * src_stride), src_stride, \
          x_offset, y_offset, \
          dst + col + (start_row * dst_stride), dst_stride, \
          sec + col + (start_row * w), w, height, &sse2); \
      se += se2; \
      long_sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  var = (int64_t)sse - ((cast se * se) >> (wlog2 + hlog2)); \
  return (var >= 0) ? (uint32_t)var : 0; \
}


// Instantiates FN for every supported block size.
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
581#endif  // CONFIG_USE_X86INC
582