variance_avx2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1/*
2 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10#include "./vpx_dsp_rtcd.h"
11
// Common signature of the block kernels driven by variance_avx2(): each call
// reads a region of src/ref and stores two results through sse and sum
// (presumably the sum of squared differences and the signed sum of
// differences -- confirm against the kernel implementations).
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

// Kernel defined outside this file. NOTE(review): despite the "32x32" in its
// name, callers below step it 16 rows at a time -- confirm it covers 32x16.
void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);

// Tiles a w x h region with calls to var_fn and accumulates their outputs.
//
// The region is walked in steps of block_size columns and 16 rows; var_fn is
// invoked once per tile with pointers to that tile's top-left sample in src
// and ref. On return *sse and *sum hold the totals over all tiles (both are
// 0 when no tile is visited).
static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse_total = 0;
  int sum_total = 0;
  int row;

  for (row = 0; row < h; row += 16) {
    int col;
    for (col = 0; col < w; col += block_size) {
      // Offsets are formed as ints first, exactly like an array index.
      const int src_offset = src_stride * row + col;
      const int ref_offset = ref_stride * row + col;
      unsigned int tile_sse;
      int tile_sum;

      var_fn(src + src_offset, src_stride, ref + ref_offset, ref_stride,
             &tile_sse, &tile_sum);
      sse_total += tile_sse;
      sum_total += tile_sum;
    }
  }

  *sse = sse_total;
  *sum = sum_total;
}
40
41
42unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
43                                    const uint8_t *ref, int ref_stride,
44                                    unsigned int *sse) {
45  int sum;
46  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
47                sse, &sum, vpx_get16x16var_avx2, 16);
48  return *sse - (((unsigned int)sum * sum) >> 8);
49}
50
// Runs the 16x16 kernel once and returns the value it stored in *sse.
// The kernel's sum output is required by its signature but is not used here.
unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int unused_sum;

  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &unused_sum);
  return *sse;
}
58
59unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
60                                    const uint8_t *ref, int ref_stride,
61                                    unsigned int *sse) {
62  int sum;
63  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
64                sse, &sum, vpx_get32x32var_avx2, 32);
65  return *sse - (((int64_t)sum * sum) >> 9);
66}
67
68unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
69                                    const uint8_t *ref, int ref_stride,
70                                    unsigned int *sse) {
71  int sum;
72  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
73                sse, &sum, vpx_get32x32var_avx2, 32);
74  return *sse - (((int64_t)sum * sum) >> 10);
75}
76
77unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
78                                    const uint8_t *ref, int ref_stride,
79                                    unsigned int *sse) {
80  int sum;
81  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
82                sse, &sum, vpx_get32x32var_avx2, 32);
83  return *sse - (((int64_t)sum * sum) >> 12);
84}
85
86unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
87                                    const uint8_t *ref, int ref_stride,
88                                    unsigned int *sse) {
89  int sum;
90  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
91                sse, &sum, vpx_get32x32var_avx2, 32);
92  return *sse - (((int64_t)sum * sum) >> 11);
93}
94
// 32-wide sub-pixel kernel, defined outside this file. Callers in this file
// pass the block height, receive the sse through *sse, and treat the return
// value as a signed sum (it is assigned to an int).
// NOTE(review): x_offset / y_offset are presumably sub-pixel filter
// positions -- confirm against the kernel implementation.
unsigned int vpx_sub_pixel_variance32xh_avx2(
    const uint8_t *src, int src_stride,
    int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride,
    int height,
    unsigned int *sse);
100
// 32-wide sub-pixel kernel taking an additional sec buffer, defined outside
// this file. Callers in this file pass sec with its stride and the block
// height, receive the sse through *sseptr, and treat the return value as a
// signed sum (it is assigned to an int).
// NOTE(review): sec is presumably a second prediction that is averaged in
// before the comparison -- confirm against the kernel implementation.
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride,
    int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride,
    const uint8_t *sec, int sec_stride,
    int height,
    unsigned int *sseptr);
111
// Sub-pixel variance of a 64x64 block.
//
// The block is processed as two 32-wide, 64-row halves (columns 0..31, then
// 32..63) by the 32xh helper; x_offset and y_offset are forwarded unchanged.
// The per-half sums and sse values are added, the combined sse is stored in
// *sse, and sse - sum^2 / 4096 is returned.
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse_total = 0;
  int sum_total = 0;
  int64_t mean_sq;
  int col;

  for (col = 0; col < 64; col += 32) {
    unsigned int half_sse;
    // The helper's unsigned return value is reinterpreted as a signed sum.
    const int half_sum = vpx_sub_pixel_variance32xh_avx2(
        src + col, src_stride, x_offset, y_offset,
        dst + col, dst_stride, 64, &half_sse);
    sum_total += half_sum;
    sse_total += half_sse;
  }

  *sse = sse_total;
  // Square in 64 bits so the product cannot wrap.
  mean_sq = ((int64_t)sum_total * sum_total) >> 12;
  return (unsigned int)(*sse - mean_sq);
}
132
// Sub-pixel variance of a 32x32 block.
//
// A single call to the 32xh helper (height 32) covers the whole block and
// writes the sse directly into *sse; x_offset and y_offset are forwarded
// unchanged. Returns sse - sum^2 / 1024.
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride,
                                              int x_offset,
                                              int y_offset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  // The helper's unsigned return value is reinterpreted as a signed sum.
  const int diff_sum = vpx_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // Square in 64 bits so the product cannot wrap.
  const int64_t mean_sq = ((int64_t)diff_sum * diff_sum) >> 10;

  return (unsigned int)(*sse - mean_sq);
}
145
// Sub-pixel variance of a 64x64 block using the sec-buffer variant of the
// 32xh helper.
//
// The block is processed as two 32-wide, 64-row halves (columns 0..31, then
// 32..63); src, dst and sec are all advanced by the same column offset and
// sec is passed with a stride of 64. The per-half sums and sse values are
// added, the combined sse is stored in *sse, and sse - sum^2 / 4096 is
// returned.
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  unsigned int sse_total = 0;
  int sum_total = 0;
  int64_t mean_sq;
  int col;

  for (col = 0; col < 64; col += 32) {
    unsigned int half_sse;
    // The helper's unsigned return value is reinterpreted as a signed sum.
    const int half_sum = vpx_sub_pixel_avg_variance32xh_avx2(
        src + col, src_stride, x_offset, y_offset,
        dst + col, dst_stride, sec + col, 64, 64, &half_sse);
    sum_total += half_sum;
    sse_total += half_sse;
  }

  *sse = sse_total;
  // Square in 64 bits so the product cannot wrap.
  mean_sq = ((int64_t)sum_total * sum_total) >> 12;
  return (unsigned int)(*sse - mean_sq);
}
169
// Sub-pixel variance of a 32x32 block using the sec-buffer variant of the
// 32xh helper.
//
// One helper call (height 32, sec stride 32) covers the full 32-sample width
// and writes the sse directly into *sse. Returns sse - sum^2 / 1024.
unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse,
                                                  const uint8_t *sec) {
  // The helper's unsigned return value is reinterpreted as a signed sum.
  const int diff_sum = vpx_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset,
      dst, dst_stride, sec, 32, 32, sse);
  // Square in 64 bits so the product cannot wrap.
  const int64_t mean_sq = ((int64_t)diff_sum * diff_sum) >> 10;

  return (unsigned int)(*sse - mean_sq);
}
184