1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12#include "./vpx_dsp_rtcd.h"
13#include "./vpx_config.h"
14
15#include "vpx/vpx_integer.h"
16
17#include "vpx_dsp/variance.h"
18#include "vpx_dsp/arm/mem_neon.h"
19
// Two-tap bilinear filter kernels, indexed by the 3-bit sub-pixel offset
// (xoffset/yoffset in [0, 7]).  Each tap pair sums to 128, i.e.
// 1 << FILTER_BITS, so vrshrn_n_u16(..., FILTER_BITS) renormalizes the
// filtered sum back to an 8-bit pixel with rounding.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};
24
// Apply a two-tap bilinear filter to a block exactly 4 pixels wide and a
// multiple of 2 high.  Two 4-wide rows are packed into each 8-lane vector,
// so each loop iteration filters and emits two output rows (8 bytes).
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t filter0 = vdup_n_u8(filter[0]);
  const uint8x8_t filter1 = vdup_n_u8(filter[1]);
  unsigned int row;
  for (row = 0; row < output_height; row += 2) {
    // Gather two unaligned 4-byte rows into one vector, for both the
    // current position and the position one pixel_step ahead.
    const uint8x8_t rows = load_unaligned_u8(src_ptr, src_pixels_per_line);
    const uint8x8_t rows_step =
        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
    uint16x8_t sum = vmull_u8(rows, filter0);
    sum = vmlal_u8(sum, rows_step, filter1);
    // Round, shift back to 8 bits and store two packed output rows.
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));
    src_ptr += 2 * src_pixels_per_line;
    output_ptr += 8;
  }
}
47
// Apply a two-tap bilinear filter to a block exactly 8 pixels wide and of
// any height.  One 8-byte output row is produced per loop iteration.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t filter0 = vdup_n_u8(filter[0]);
  const uint8x8_t filter1 = vdup_n_u8(filter[1]);
  unsigned int row;
  for (row = 0; row < output_height; ++row) {
    const uint8x8_t pixels = vld1_u8(src_ptr);
    const uint8x8_t pixels_step = vld1_u8(src_ptr + pixel_step);
    uint16x8_t sum = vmull_u8(pixels, filter0);
    sum = vmlal_u8(sum, pixels_step, filter1);
    // Round and narrow the 16-bit sums back to 8-bit pixels.
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}
69
// Apply a two-tap bilinear filter to a block that is a multiple of 16
// pixels wide and of any height.  Each inner iteration filters 16 pixels,
// splitting the 16-lane load into low/high 8-lane halves because the
// widening multiply produces 16-bit intermediates.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t filter0 = vdup_n_u8(filter[0]);
  const uint8x8_t filter1 = vdup_n_u8(filter[1]);
  unsigned int row, col;
  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t pixels = vld1q_u8(&src_ptr[col]);
      const uint8x16_t pixels_step = vld1q_u8(&src_ptr[col + pixel_step]);
      // Low 8 lanes.
      uint16x8_t sum_lo = vmull_u8(vget_low_u8(pixels), filter0);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(pixels_step), filter1);
      // High 8 lanes.
      uint16x8_t sum_hi = vmull_u8(vget_high_u8(pixels), filter0);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(pixels_step), filter1);
      // Round/narrow both halves and store 16 output pixels at once.
      vst1q_u8(output_ptr + col,
               vcombine_u8(vrshrn_n_u16(sum_lo, FILTER_BITS),
                           vrshrn_n_u16(sum_hi, FILTER_BITS)));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
97
// Generates vpx_sub_pixel_variance{n}x{m}_neon(), which:
//   1) horizontally filters the source (tap pair chosen by xoffset) into
//      temp0,
//   2) vertically filters temp0 (tap pair chosen by yoffset) into temp1,
//   3) returns the plain NxM variance of temp1 against the reference b,
//      writing the sum of squared errors through *sse.
// The vertical pass needs one row beyond the block, hence the intermediate
// height of m + 1.  The 4xM filter processes two rows at a time and so
// writes one extra row into temp0, which is why the 4-wide path sizes the
// buffer for m + 2 rows instead.
#define sub_pixel_varianceNxM(n, m)                                 \
  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
    return vpx_variance##n##x##m(temp1, n, b, b_stride, sse);       \
  }

sub_pixel_varianceNxM(4, 4);
sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
sub_pixel_varianceNxM(16, 8);
sub_pixel_varianceNxM(16, 16);
sub_pixel_varianceNxM(16, 32);
sub_pixel_varianceNxM(32, 16);
sub_pixel_varianceNxM(32, 32);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);
139
// Generates vpx_sub_pixel_avg_variance{n}x{m}_neon().  Identical bilinear
// filtering pipeline to sub_pixel_varianceNxM (horizontal pass into temp0,
// vertical pass into temp1; the 4xM filter processes two rows at a time and
// writes an extra row, hence the m + 2 sizing for the 4-wide path), except
// that before computing the variance the filtered block temp1 is averaged
// with second_pred.  temp0 is reused as the destination of that average,
// and the variance of temp0 against the reference b is returned, with the
// sum of squared errors written through *sse.
#define sub_pixel_avg_varianceNxM(n, m)                             \
  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(              \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse,                \
      const uint8_t *second_pred) {                                 \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
                                                                    \
    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);          \
                                                                    \
    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);       \
  }

sub_pixel_avg_varianceNxM(4, 4);
sub_pixel_avg_varianceNxM(4, 8);
sub_pixel_avg_varianceNxM(8, 4);
sub_pixel_avg_varianceNxM(8, 8);
sub_pixel_avg_varianceNxM(8, 16);
sub_pixel_avg_varianceNxM(16, 8);
sub_pixel_avg_varianceNxM(16, 16);
sub_pixel_avg_varianceNxM(16, 32);
sub_pixel_avg_varianceNxM(32, 16);
sub_pixel_avg_varianceNxM(32, 32);
sub_pixel_avg_varianceNxM(32, 64);
sub_pixel_avg_varianceNxM(64, 32);
sub_pixel_avg_varianceNxM(64, 64);
185