/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse, int *sum);

void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height, unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset, int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride, int height,
                                                 unsigned int *sse);

static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0;
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // Each call processes a block_size-wide, 16-row tile and returns its
      // partial SSE and sum, which are accumulated over the whole block.
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

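// For reference: each wrapper below turns the accumulated SSE and sum into
// the block variance via the identity variance = SSE - sum^2 / (w * h).
// The pixel count is always a power of two here, so the division becomes a
// right shift: >> 8 for 16x16, >> 9 for 32x16, >> 10 for 32x32, >> 11 for
// 64x32 and >> 12 for 64x64. The scalar sketch below is illustrative only
// (it is not compiled, and the AVX2 kernels are assumed to produce matching
// SSE/sum values); it shows the computation the SIMD paths replace.
#if 0
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int i, j, sum = 0;
  unsigned int sse0 = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;
      sse0 += diff * diff;
    }
  }
  *sse = sse0;
  // sum * sum can exceed 32 bits for large blocks, hence the 64-bit
  // intermediate.
  return sse0 - (unsigned int)(((int64_t)sum * sum) / (w * h));
}
#endif  // 0
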
unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  // For a 16x16 block avg * avg fits in 32 bits, so no 64-bit cast is
  // needed; >> 8 divides by the 256 pixels in the block.
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr,
                               int source_stride,
                               const unsigned char *ref_ptr,
                               int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;

  // MSE is just the raw SSE; the sum output of the kernel is unused.
  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
                       &sum0);
  *sse = sse0;
  return sse0;
}

unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // The 32-wide kernel processes 32 pixels of each row in parallel.
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // The 32-wide kernel processes 32 pixels of each row in parallel.
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 9));
}

unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // The 32-wide kernel covers each 64-pixel row in two parallel passes.
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
                                    int source_stride,
                                    const uint8_t *ref_ptr,
                                    int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // The 32-wide kernel covers each 64-pixel row in two parallel passes.
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}

unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // Process the left 32x64 half of the block.
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // Process the right 32x64 half.
  unsigned int sse2;
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

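// Note on the split above: SSE and the signed error sum are both additive
// over the two disjoint 32-pixel-wide halves, so the halves can be measured
// independently and the variance formed once from the combined totals
// (>> 12 dividing by the 4096 pixels of the 64x64 block).
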
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  // A single 32-wide call covers the whole 32x32 block.
  unsigned int sse;
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}

unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse_ptr,
                                                  const uint8_t *sec) {
  // Process the left 32x64 half of the block.
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  // Process the right 32x64 half.
  unsigned int sse2;
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse_ptr,
                                                  const uint8_t *sec) {
  // A single 32-wide call covers the whole 32x32 block.
  unsigned int sse;
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}
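
// Usage sketch (illustrative only, not part of the library): computing the
// variance of a 32x32 block directly through the AVX2 path. In libvpx these
// functions are normally reached through the run-time CPU dispatch layer
// rather than called by name, and the caller is assumed to have verified
// AVX2 support first.
//
//   unsigned int sse;
//   const unsigned int var =
//       vp9_variance32x32_avx2(src, src_stride, ref, ref_stride, &sse);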