/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org#include <emmintrin.h>  // SSE2
12d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
13ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.org#include "./vpx_config.h"
14d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org#include "vp9/encoder/vp9_variance.h"
166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org#include "vpx_ports/mem.h"
176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Pointer to a fixed-size block kernel.  The kernel receives a source and a
// reference block (each with its own stride) and reports its results through
// the sse and sum output pointers.
typedef unsigned int (*variance_fn_t)(const unsigned char *src,
                                      int src_stride,
                                      const unsigned char *ref,
                                      int ref_stride,
                                      unsigned int *sse, int *sum);
216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Returns the sum of the squares of the 256 int16_t values starting at src.
// Unaligned loads are used, so src needs no particular alignment.
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
  const int16_t *const end = src + 256;
  __m128i acc = _mm_setzero_si128();

  while (src < end) {
    const __m128i coeffs = _mm_loadu_si128((const __m128i *)src);
    // madd squares each 16-bit lane and adds adjacent pairs into 32-bit lanes.
    const __m128i squares = _mm_madd_epi16(coeffs, coeffs);
    acc = _mm_add_epi32(acc, squares);
    src += 8;
  }

  // Fold the four 32-bit partial sums down into lane 0.
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
  acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 4));
  return _mm_cvtsi128_si32(acc);
}
36d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Loads four bytes from row i and four bytes from row i + 1 of p (rows are
// `stride` bytes apart) and interleaves them byte-wise into the low 64 bits
// of an __m128i.
// Every argument is parenthesized so that expressions (e.g. a stride written
// as `a + b`) expand with the intended precedence.
// NOTE(review): the loads go through a uint32_t pointer cast; this assumes the
// target tolerates unaligned, type-punned 32-bit reads -- confirm.
#define READ64(p, stride, i)                                          \
  _mm_unpacklo_epi8(                                                  \
      _mm_cvtsi32_si128(*(const uint32_t *)((p) + (i) * (stride))),   \
      _mm_cvtsi32_si128(*(const uint32_t *)((p) + ((i) + 1) * (stride))))
40d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
41d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_get4x4var_sse2(const uint8_t *src, int src_stride,
42d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                const uint8_t *ref, int ref_stride,
43d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                unsigned int *sse, int *sum) {
44d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i zero = _mm_setzero_si128();
45d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
46d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
47d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
48d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
49d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
50d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
51d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
52d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  // sum
53d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  __m128i vsum = _mm_add_epi16(diff0, diff1);
54d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
55d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
56d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
57d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
58d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
59d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  // sse
60d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
61d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                       _mm_madd_epi16(diff1, diff1));
62d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
63d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
64d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  *sse = _mm_cvtsi128_si32(vsum);
65d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
66d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return 0;
67d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org}
68d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// For an 8x8 block, stores the sum of (src - ref) over all 64 pixels in *sum
// and the sum of the squared differences in *sse.  Always returns 0.
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum16 = _mm_setzero_si128();  // eight 16-bit column sums
  __m128i sse32 = _mm_setzero_si128();  // four 32-bit squared-diff sums
  int row;

  for (row = 0; row < 8; ++row) {
    // One 8-pixel row from each block, widened to 16 bits per pixel.
    const __m128i s = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + row * src_stride)), zero);
    const __m128i r = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + row * ref_stride)), zero);
    const __m128i diff = _mm_sub_epi16(s, r);

    sum16 = _mm_add_epi16(sum16, diff);
    sse32 = _mm_add_epi32(sse32, _mm_madd_epi16(diff, diff));
  }

  // Reduce the eight 16-bit lanes to lane 0; the cast sign-extends it.
  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 8));
  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 4));
  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 2));
  *sum = (int16_t)_mm_extract_epi16(sum16, 0);

  // Reduce the four 32-bit lanes to lane 0.
  sse32 = _mm_add_epi32(sse32, _mm_srli_si128(sse32, 8));
  sse32 = _mm_add_epi32(sse32, _mm_srli_si128(sse32, 4));
  *sse = _mm_cvtsi128_si32(sse32);

  return 0;
}
109d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// For a 16x16 block, stores the sum of (src - ref) over all 256 pixels in
// *sum and the sum of the squared differences in *sse.  Always returns 0.
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum16 = _mm_setzero_si128();  // eight 16-bit partial sums
  __m128i sse32 = _mm_setzero_si128();  // four 32-bit squared-diff sums
  int row;

  for (row = 0; row < 16; ++row) {
    const __m128i s =
        _mm_loadu_si128((const __m128i *)(src + row * src_stride));
    const __m128i r =
        _mm_loadu_si128((const __m128i *)(ref + row * ref_stride));

    // Low and high eight pixels of the row, widened to 16 bits.
    const __m128i d_lo = _mm_sub_epi16(_mm_unpacklo_epi8(s, zero),
                                       _mm_unpacklo_epi8(r, zero));
    const __m128i d_hi = _mm_sub_epi16(_mm_unpackhi_epi8(s, zero),
                                       _mm_unpackhi_epi8(r, zero));

    sum16 = _mm_add_epi16(sum16, _mm_add_epi16(d_lo, d_hi));
    sse32 = _mm_add_epi32(sse32, _mm_add_epi32(_mm_madd_epi16(d_lo, d_lo),
                                               _mm_madd_epi16(d_hi, d_hi)));
  }

  // Only fold down to two 16-bit lanes in the vector domain: 256 differences
  // of magnitude up to 255 do not fit in 16 bits, so the last addition is
  // done on sign-extended ints.
  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 8));
  sum16 = _mm_add_epi16(sum16, _mm_srli_si128(sum16, 4));
  *sum = (int16_t)_mm_extract_epi16(sum16, 0) +
         (int16_t)_mm_extract_epi16(sum16, 1);

  // Reduce the four 32-bit lanes to lane 0.
  sse32 = _mm_add_epi32(sse32, _mm_srli_si128(sse32, 8));
  sse32 = _mm_add_epi32(sse32, _mm_srli_si128(sse32, 4));
  *sse = _mm_cvtsi128_si32(sse32);

  return 0;
}
15288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
15388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org
15488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgstatic void variance_sse2(const unsigned char *src, int src_stride,
15588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                          const unsigned char *ref, int ref_stride,
15688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                          int w, int h, unsigned int *sse, int *sum,
15788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                          variance_fn_t var_fn, int block_size) {
15810a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org  int i, j;
15910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
16010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org  *sse = 0;
16110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org  *sum = 0;
16210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
16310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org  for (i = 0; i < h; i += block_size) {
16410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org    for (j = 0; j < w; j += block_size) {
16588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org      unsigned int sse0;
16688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org      int sum0;
16788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org      var_fn(src + src_stride * i + j, src_stride,
16888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
16910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org      *sse += sse0;
17010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org      *sum += sum0;
17110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org    }
17210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org  }
17310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
17410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
// Variance of a 4x4 block: the sum of squared differences minus the squared
// sum of differences divided by the pixel count.  The sum of squared
// differences is also left in *sse.
unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int diff_sum;
  unsigned int sq_mean;

  vp9_get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &diff_sum);
  sq_mean = ((unsigned int)diff_sum * diff_sum) >> 4;  // / 16 pixels
  return *sse - sq_mean;
}
18210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
18388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
18488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                  const uint8_t *ref, int ref_stride,
18510a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                  unsigned int *sse) {
18688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
18788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
188d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get4x4var_sse2, 4);
18988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((unsigned int)sum * sum) >> 5);
19010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
19110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
19288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
19388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                  const uint8_t *ref, int ref_stride,
19410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                  unsigned int *sse) {
19588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
19688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
197d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get4x4var_sse2, 4);
19888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((unsigned int)sum * sum) >> 5);
1996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org}
2006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Variance of an 8x8 block: the sum of squared differences minus the squared
// sum of differences divided by the pixel count.  The sum of squared
// differences is also left in *sse.
unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int diff_sum;
  unsigned int sq_mean;

  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &diff_sum);
  sq_mean = ((unsigned int)diff_sum * diff_sum) >> 6;  // / 64 pixels
  return *sse - sq_mean;
}
2086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
20988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
21088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                   const unsigned char *ref, int ref_stride,
21188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                   unsigned int *sse) {
21288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
21388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
21488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get8x8var_sse2, 8);
21588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((unsigned int)sum * sum) >> 7);
2166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org}
2173f0af3b06425f635f3559f0bd4f53efea95fa5e2johannkoenig@chromium.org
21888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
21988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                   const unsigned char *ref, int ref_stride,
22088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                   unsigned int *sse) {
22188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
22288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
22388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get8x8var_sse2, 8);
22488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((unsigned int)sum * sum) >> 7);
2256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org}
2266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Variance of a 16x16 block: the sum of squared differences minus the squared
// sum of differences divided by the pixel count.  The sum of squared
// differences is also left in *sse.
unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int diff_sum;
  unsigned int sq_mean;

  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &diff_sum);
  // The square is formed in unsigned arithmetic before dividing by 256.
  sq_mean = ((unsigned int)diff_sum * diff_sum) >> 8;
  return *sse - sq_mean;
}
23410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
23588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
23688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
23710a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
23888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
23988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
24088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
24188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 10);
24210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
24310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
24488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
24588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
24610a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
24788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
24888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
24988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
25088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 9);
25110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
25210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
25388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
25488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
25510a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
25688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
25788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
25888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
25988b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 9);
26010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
26110a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
26288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
26388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
26410a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
26588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
26688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
26788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
26888b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 12);
26910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
27010a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
27188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
27288b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
27310a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
27488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
27588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
27688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
27788b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 11);
27810a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org}
27910a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org
28088b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.orgunsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
28188b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                                    const uint8_t *ref, int ref_stride,
28210a9a0d835561a7f2300c561c514efcf374554d6fgalligan@chromium.org                                    unsigned int *sse) {
28388b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  int sum;
28488b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
28588b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org                sse, &sum, vp9_get16x16var_sse2, 16);
28688b47b29cc274dd19cddc37c1ce1834d97df282efgalligan@chromium.org  return *sse - (((int64_t)sum * sum) >> 11);
2876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org}
2886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Returns the sum of squared differences of an 8x8 block (not divided by the
// pixel count); the same value is left in *sse.
unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  unsigned int squared_error;

  // Called only to fill *sse; the variance it returns is not needed.
  (void)vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  squared_error = *sse;
  return squared_error;
}
295d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Returns the sum of squared differences of an 8x16 block (not divided by
// the pixel count); the same value is left in *sse.
unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  unsigned int squared_error;

  // Called only to fill *sse; the variance it returns is not needed.
  (void)vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  squared_error = *sse;
  return squared_error;
}
302d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Returns the sum of squared differences of a 16x8 block (not divided by the
// pixel count); the same value is left in *sse.
unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  unsigned int squared_error;

  // Called only to fill *sse; the variance it returns is not needed.
  (void)vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  squared_error = *sse;
  return squared_error;
}
309d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
/* 16x16 "mse" entry point (SSE2).
 *
 * Runs vp9_variance16x16_sse2() on the two blocks for its write to *sse and
 * returns whatever is found in *sse afterwards.  Any value the helper itself
 * returns is ignored.
 */
unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  /* Called only for its effect on *sse. */
  (void)vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return sse[0];
}
316d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
/* Prototypes of the fixed-width, variable-height sub-pixel variance kernels
 * vp9_sub_pixel_variance<w>xh_<opt>().  This block only declares them; their
 * definitions are not in this part of the file (NOTE(review): presumably
 * assembly -- confirm).  Each kernel receives the block height as an
 * argument and an `unsigned int *sse` pointer, and returns an int.
 *
 * In the first family the 4-wide kernel carries the suffix `sse` while the
 * 8- and 16-wide kernels carry `sse2`; the second family is `ssse3`
 * throughout.
 */
#define DECL(w, opt) \
  int vp9_sub_pixel_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, \
      int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, \
      int height, unsigned int *sse)

DECL(4, sse);
DECL(8, sse2);
DECL(16, sse2);

DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
333d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
/* FN(w, h, wf, wlog2, hlog2, opt, cast) defines the function
 * vp9_sub_pixel_variance<w>x<h>_<opt>() on top of the wf-wide,
 * variable-height kernel vp9_sub_pixel_variance<wf>xh_<opt>().
 *
 * The kernel is run once at column 0 and, when w exceeds wf, again at column
 * offset 16 (plus 32 and 48 when w exceeds 2 * wf).  The int returned by each
 * call and the value each call leaves in its last argument are summed over
 * all calls.  The summed sse is stored in *sse_ptr, and the function returns
 *   sse - ((cast se * se) >> (wlog2 + hlog2))
 * where `cast` is a parenthesised type applied to se before the multiply.
 *
 * The 16/32/48 offsets are literals, so an instantiation with w > wf only
 * makes sense when wf is 16.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
  unsigned int sse_total; \
  int se_total = vp9_sub_pixel_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse_total); \
  if (w > wf) { \
    unsigned int sse_part; \
    /* second strip of columns */ \
    se_total += vp9_sub_pixel_variance##wf##xh_##opt( \
        src + 16, src_stride, x_offset, y_offset, \
        dst + 16, dst_stride, h, &sse_part); \
    sse_total += sse_part; \
    if (w > wf * 2) { \
      /* third and fourth strips */ \
      se_total += vp9_sub_pixel_variance##wf##xh_##opt( \
          src + 32, src_stride, x_offset, y_offset, \
          dst + 32, dst_stride, h, &sse_part); \
      sse_total += sse_part; \
      se_total += vp9_sub_pixel_variance##wf##xh_##opt( \
          src + 48, src_stride, x_offset, y_offset, \
          dst + 48, dst_stride, h, &sse_part); \
      sse_total += sse_part; \
    } \
  } \
  *sse_ptr = sse_total; \
  return sse_total - ((cast se_total * se_total) >> (wlog2 + hlog2)); \
}
3726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
373d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#define FNS(opt1, opt2) \
374d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(64, 64, 16, 6, 6, opt1, (int64_t)); \
375d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(64, 32, 16, 6, 5, opt1, (int64_t)); \
376d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 64, 16, 5, 6, opt1, (int64_t)); \
377d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 32, 16, 5, 5, opt1, (int64_t)); \
378d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 16, 16, 5, 4, opt1, (int64_t)); \
379d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(16, 32, 16, 4, 5, opt1, (int64_t)); \
380d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
381ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
382ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
383ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
384ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
385ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
386ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(4,   4,  4, 2, 2, opt2, (unsigned int))
387d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
388d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFNS(sse2, sse);
389d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFNS(ssse3, ssse3);
390d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
391d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#undef FNS
392d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#undef FN
393d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
/* Prototypes of the fixed-width, variable-height sub-pixel "avg" variance
 * kernels vp9_sub_pixel_avg_variance<w>xh_<opt>().  This block only declares
 * them; their definitions are not in this part of the file (NOTE(review):
 * presumably assembly -- confirm).  Compared with the plain sub-pixel
 * kernels they take an additional `sec` pointer with its own stride.
 *
 * In the first family the 4-wide kernel carries the suffix `sse` while the
 * 8- and 16-wide kernels carry `sse2`; the second family is `ssse3`
 * throughout.
 */
#define DECL(w, opt) \
  int vp9_sub_pixel_avg_variance##w##xh_##opt( \
      const uint8_t *src, ptrdiff_t src_stride, \
      int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, \
      const uint8_t *sec, ptrdiff_t sec_stride, \
      int height, unsigned int *sse)

DECL(4, sse);
DECL(8, sse2);
DECL(16, sse2);

DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
412d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
/* FN(w, h, wf, wlog2, hlog2, opt, cast) defines the function
 * vp9_sub_pixel_avg_variance<w>x<h>_<opt>() on top of the wf-wide,
 * variable-height kernel vp9_sub_pixel_avg_variance<wf>xh_<opt>().
 *
 * The kernel is run once at column 0 and, when w exceeds wf, again at column
 * offset 16 (plus 32 and 48 when w exceeds 2 * wf); src, dst and sec are all
 * advanced by the same offset.  The full block width w is what gets passed
 * in the argument slot following sec.  The int returned by each call and the
 * value each call leaves in its last argument are summed over all calls.
 * The summed sse is stored in *sseptr, and the function returns
 *   sse - ((cast se * se) >> (wlog2 + hlog2))
 * where `cast` is a parenthesised type applied to se before the multiply.
 *
 * The 16/32/48 offsets are literals, so an instantiation with w > wf only
 * makes sense when wf is 16.
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst, int dst_stride, unsigned int *sseptr, \
    const uint8_t *sec) { \
  unsigned int sse_total; \
  int se_total = vp9_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, y_offset, dst, dst_stride, \
      sec, w, h, &sse_total); \
  if (w > wf) { \
    unsigned int sse_part; \
    /* second strip of columns */ \
    se_total += vp9_sub_pixel_avg_variance##wf##xh_##opt( \
        src + 16, src_stride, x_offset, y_offset, \
        dst + 16, dst_stride, sec + 16, w, h, &sse_part); \
    sse_total += sse_part; \
    if (w > wf * 2) { \
      /* third and fourth strips */ \
      se_total += vp9_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 32, src_stride, x_offset, y_offset, \
          dst + 32, dst_stride, sec + 32, w, h, &sse_part); \
      sse_total += sse_part; \
      se_total += vp9_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 48, src_stride, x_offset, y_offset, \
          dst + 48, dst_stride, sec + 48, w, h, &sse_part); \
      sse_total += sse_part; \
    } \
  } \
  *sseptr = sse_total; \
  return sse_total - ((cast se_total * se_total) >> (wlog2 + hlog2)); \
}
4526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
453d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#define FNS(opt1, opt2) \
454d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(64, 64, 16, 6, 6, opt1, (int64_t)); \
455d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(64, 32, 16, 6, 5, opt1, (int64_t)); \
456d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 64, 16, 5, 6, opt1, (int64_t)); \
457d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 32, 16, 5, 5, opt1, (int64_t)); \
458d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(32, 16, 16, 5, 4, opt1, (int64_t)); \
459d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(16, 32, 16, 4, 5, opt1, (int64_t)); \
460d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
461ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
462ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
463ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
464ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
465ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
466ecee051929d6ced19cf324688774acccc9ad4a0ajohannkoenig@chromium.orgFN(4,   4,  4, 2, 2, opt2, (unsigned int))
467d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
468d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFNS(sse2, sse);
469d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.orgFNS(ssse3, ssse3);
470d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org
471d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#undef FNS
472d0351deb5037171ecec154298d37e3a74d992b0dfgalligan@chromium.org#undef FN
473