/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "./vpx_config.h"
11dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
12dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "vp9/encoder/vp9_variance.h"
13dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org#include "vpx_ports/mem.h"
14dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
// Signature shared by the block kernels that variance_avx2() dispatches to.
// A kernel is handed a position in `src` and in `ref` together with the two
// row strides, and reports its two results through the `sse` and `sum`
// output pointers.
typedef void (*get_var_avx2)(const uint8_t *src,
                             int src_stride,
                             const uint8_t *ref,
                             int ref_stride,
                             unsigned int *sse,
                             int *sum);
18d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Kernel defined outside this file.  Within this file it serves as the
// per-block callback for the 16x16 variance and is called directly for the
// 16x16 MSE.  Results come back through `sse` and `sum`.
void vp9_get16x16var_avx2(const uint8_t *src,
                          int src_stride,
                          const uint8_t *ref,
                          int ref_stride,
                          unsigned int *sse,
                          int *sum);
22d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Kernel defined outside this file.  Within this file it is the per-block
// callback for every 32- and 64-wide variance, stepped 32 columns at a time.
// NOTE(review): variance_avx2() advances 16 rows between calls, so despite
// the name this kernel presumably covers a 32x16 region per call -- confirm
// against its implementation.
void vp9_get32x32var_avx2(const uint8_t *src,
                          int src_stride,
                          const uint8_t *ref,
                          int ref_stride,
                          unsigned int *sse,
                          int *sum);
26d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Sub-pixel kernel defined outside this file; takes an explicit `height`.
// Callers in this file invoke it once per 32-column strip, store the return
// value in a signed int that is later squared, and read a second statistic
// back through `sse`.
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse);
32d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
// Sub-pixel kernel defined outside this file; same calling pattern as
// vp9_sub_pixel_variance32xh_avx2() plus an extra `sec` buffer with its own
// stride.
// NOTE(review): `sec` is presumably a second predictor that gets averaged in
// before the comparison -- confirm against the kernel implementation.
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sseptr);
43d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org
44d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgstatic void variance_avx2(const uint8_t *src, int src_stride,
45d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                          const uint8_t *ref, int  ref_stride,
46d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                          int w, int h, unsigned int *sse, int *sum,
47d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                          get_var_avx2 var_fn, int block_size) {
48dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org  int i, j;
49dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
50dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org  *sse = 0;
51dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org  *sum = 0;
52dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
53dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org  for (i = 0; i < h; i += 16) {
54dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org    for (j = 0; j < w; j += block_size) {
55d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      unsigned int sse0;
56d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      int sum0;
57d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org      var_fn(&src[src_stride * i + j], src_stride,
58d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
59dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      *sse += sse0;
60dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org      *sum += sum0;
61dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org    }
62dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org  }
63dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
64dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
65dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
66d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
67d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    const uint8_t *ref, int ref_stride,
68d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    unsigned int *sse) {
69d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
70d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
71d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get16x16var_avx2, 16);
72d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((unsigned int)sum * sum) >> 8);
73dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
74dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
// 16x16 MSE wrapper.  Makes a single vp9_get16x16var_avx2() call on
// `src` / `ref`, lets it write `*sse`, and returns that value.  The `sum`
// output of the kernel is not used.
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int unused_sum;

  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &unused_sum);
  return *sse;
}
82dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
83d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
84d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    const uint8_t *ref, int ref_stride,
85dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org                                    unsigned int *sse) {
86d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
87d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
88d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get32x32var_avx2, 32);
89d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((int64_t)sum * sum) >> 9);
90dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
91dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
92d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
93d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    const uint8_t *ref, int ref_stride,
94dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org                                    unsigned int *sse) {
95d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
96d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
97d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get32x32var_avx2, 32);
98d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((int64_t)sum * sum) >> 10);
99dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
100dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
101d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
102d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    const uint8_t *ref, int ref_stride,
103dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org                                    unsigned int *sse) {
104d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
105d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
106d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get32x32var_avx2, 32);
107d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((int64_t)sum * sum) >> 12);
108dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
109dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org
110d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.orgunsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
111d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                                    const uint8_t *ref, int ref_stride,
112dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org                                    unsigned int *sse) {
113d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  int sum;
114d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
115d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org                sse, &sum, vp9_get32x32var_avx2, 32);
116d95585fb0ec024f6abd96f7b02e0df58019d46afjohannkoenig@chromium.org  return *sse - (((int64_t)sum * sum) >> 11);
117dddee1ec7cedf276305b107429f684539b105276johannkoenig@chromium.org}
118411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// 64x64 sub-pixel variance wrapper.  The block is handled as two 32-column
// strips (column offsets 0 and 32), each passed to
// vp9_sub_pixel_variance32xh_avx2() with a height of 64.  The two returned
// sums and the two `sse` outputs are added; `*sse` receives the combined sse
// and the function returns *sse - sum * sum / 4096 (shift by 12).
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse_total = 0;
  int sum_total = 0;
  int col;

  for (col = 0; col < 64; col += 32) {
    unsigned int strip_sse;
    // The kernel returns unsigned int; it is reinterpreted as a signed sum.
    const int strip_sum = (int)vp9_sub_pixel_variance32xh_avx2(
        src + col, src_stride, x_offset, y_offset, dst + col, dst_stride, 64,
        &strip_sse);

    sse_total += strip_sse;
    sum_total += strip_sum;
  }

  *sse = sse_total;
  return *sse - (((int64_t)sum_total * sum_total) >> 12);
}
139411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// 32x32 sub-pixel variance wrapper.  A single
// vp9_sub_pixel_variance32xh_avx2() call with a height of 32 covers the
// block; the kernel writes `*sse` directly and its return value is taken as
// the signed sum.  Returns *sse - sum * sum / 1024 (shift by 10).
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int sum = (int)vp9_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  const int64_t sum_sq = (int64_t)sum * sum;

  return (unsigned int)(*sse - (sum_sq >> 10));
}
152411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// 64x64 sub-pixel "avg" variance wrapper.  The block is handled as two
// 32-column strips (column offsets 0 and 32 into `src`, `dst` and `sec`),
// each passed to vp9_sub_pixel_avg_variance32xh_avx2() with a `sec` stride of
// 64 and a height of 64.  The two returned sums and the two `sse` outputs are
// added; `*sse` receives the combined sse and the function returns
// *sse - sum * sum / 4096 (shift by 12).
unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  unsigned int sse_total = 0;
  int sum_total = 0;
  int col;

  for (col = 0; col < 64; col += 32) {
    unsigned int strip_sse;
    // The kernel returns unsigned int; it is reinterpreted as a signed sum.
    const int strip_sum = (int)vp9_sub_pixel_avg_variance32xh_avx2(
        src + col, src_stride, x_offset, y_offset, dst + col, dst_stride,
        sec + col, 64, 64, &strip_sse);

    sse_total += strip_sse;
    sum_total += strip_sum;
  }

  *sse = sse_total;
  return *sse - (((int64_t)sum_total * sum_total) >> 12);
}
176411971f94253c85e1866c281860d6344f6aa0c78fgalligan@chromium.org
// 32x32 sub-pixel "avg" variance wrapper.  A single
// vp9_sub_pixel_avg_variance32xh_avx2() call with a `sec` stride of 32 and a
// height of 32 covers the block; the kernel writes `*sse` directly and its
// return value is taken as the signed sum.
// Returns *sse - sum * sum / 1024 (shift by 10).
unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  const int sum = (int)vp9_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
  const int64_t sum_sq = (int64_t)sum * sum;

  return (unsigned int)(*sse - (sum_sq >> 10));
}
191