/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"

// Bilinear filter taps indexed by 1/8-pel offset (0..7). Each pair of taps
// sums to 128, which is presumably 1 << FILTER_BITS — TODO confirm against
// the FILTER_BITS definition pulled in via the vpx_dsp headers.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

// Process a block exactly 4 wide and a multiple of 2 high.
//
// Applies the 2-tap bilinear filter `filter` to a 4-wide column of pixels,
// writing the rounded 8-bit result to `output_ptr`. Two rows are handled per
// iteration: load_unaligned_u8 packs two 4-byte rows (src_ptr and
// src_ptr + src_pixels_per_line — presumably; verify against mem_neon.h) into
// one 8-lane vector, so each iteration consumes 2 source rows and emits
// 8 output bytes. `output_height` is therefore expected to be even; the
// output is written densely (8 bytes per 2 rows, i.e. stride 4).
// `pixel_step` selects the second filter tap's source: 1 for horizontal
// filtering, src_pixels_per_line for vertical.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; i += 2) {
    // Two 4-pixel rows for each tap position.
    const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
    const uint8x8_t src_1 =
        load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
    // out = round((src_0 * f0 + src_1 * f1) >> FILTER_BITS), per lane.
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(output_ptr, out);
    src_ptr += 2 * src_pixels_per_line;
    output_ptr += 8;
  }
}

// Process a block exactly 8 wide and any height.
// Applies the 2-tap bilinear filter to one 8-pixel row per iteration,
// storing the rounded 8-bit results densely at `output_ptr` (stride 8).
// `pixel_step` picks the second tap's source offset: 1 for horizontal
// filtering, src_pixels_per_line for vertical.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t filter0 = vdup_n_u8(filter[0]);
  const uint8x8_t filter1 = vdup_n_u8(filter[1]);
  unsigned int row;
  for (row = 0; row < output_height; ++row) {
    const uint8x8_t pixels_0 = vld1_u8(src_ptr);
    const uint8x8_t pixels_1 = vld1_u8(src_ptr + pixel_step);
    // out = round((pixels_0 * filter0 + pixels_1 * filter1) >> FILTER_BITS).
    uint16x8_t sum = vmull_u8(pixels_0, filter0);
    sum = vmlal_u8(sum, pixels_1, filter1);
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}

// Process a block which is a multiple of 16 wide and any height.
//
// Same bilinear filter as above, applied 16 pixels at a time. The 16-byte
// vector is split into low/high halves because the widening multiply
// operates on 8 lanes; the two narrowed halves are recombined for a single
// 16-byte store. Output rows are written densely with stride `output_width`.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t filter0 = vdup_n_u8(filter[0]);
  const uint8x8_t filter1 = vdup_n_u8(filter[1]);
  unsigned int row, col;
  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t pixels_0 = vld1q_u8(src_ptr + col);
      const uint8x16_t pixels_1 = vld1q_u8(src_ptr + col + pixel_step);
      uint16x8_t sum_lo = vmull_u8(vget_low_u8(pixels_0), filter0);
      uint16x8_t sum_hi = vmull_u8(vget_high_u8(pixels_0), filter0);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(pixels_1), filter1);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(pixels_1), filter1);
      vst1q_u8(output_ptr + col,
               vcombine_u8(vrshrn_n_u16(sum_lo, FILTER_BITS),
                           vrshrn_n_u16(sum_hi, FILTER_BITS)));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

// The 4xM first pass writes an extra row to the intermediate buffer because
// it processes two rows at a time.
// Defines vpx_sub_pixel_variance<n>x<m>_neon():
//   1. Horizontally bilinear-filter the n x m source block `a` using the
//      taps selected by `xoffset` (0..7, indexing bilinear_filters), into
//      temp0. One extra row beyond m is filtered (two for n == 4, since the
//      w4 kernel works two rows at a time) so the vertical pass has the
//      row below each output row available.
//   2. Vertically filter temp0 by `yoffset` into temp1 (pixel_step == n,
//      the dense intermediate stride).
//   3. Return the variance of temp1 against `b` (stride b_stride), with the
//      sum of squared errors stored through `sse`.
#define sub_pixel_varianceNxM(n, m)                                 \
  uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                  \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse) {              \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
    return vpx_variance##n##x##m(temp1, n, b, b_stride, sse);       \
  }

sub_pixel_varianceNxM(4, 4);
sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
sub_pixel_varianceNxM(16, 8);
sub_pixel_varianceNxM(16, 16);
sub_pixel_varianceNxM(16, 32);
sub_pixel_varianceNxM(32, 16);
sub_pixel_varianceNxM(32, 32);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);

// Defines vpx_sub_pixel_avg_variance<n>x<m>_neon(): identical two-pass
// bilinear filtering as above, but the filtered block (temp1) is first
// averaged with `second_pred` — the result is written back into temp0,
// which is safe to reuse since it is at least n * m bytes — and the
// variance of that average against `b` is returned.
#define sub_pixel_avg_varianceNxM(n, m)                             \
  uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(              \
      const uint8_t *a, int a_stride, int xoffset, int yoffset,     \
      const uint8_t *b, int b_stride, uint32_t *sse,                \
      const uint8_t *second_pred) {                                 \
    uint8_t temp0[n * (m + (n == 4 ? 2 : 1))];                      \
    uint8_t temp1[n * m];                                           \
                                                                    \
    if (n == 4) {                                                   \
      var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w4(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else if (n == 8) {                                            \
      var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1),     \
                                bilinear_filters[xoffset]);         \
      var_filter_block2d_bil_w8(temp0, temp1, n, n, m,              \
                                bilinear_filters[yoffset]);         \
    } else {                                                        \
      var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
                                 bilinear_filters[xoffset]);        \
      var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n,          \
                                 bilinear_filters[yoffset]);        \
    }                                                               \
                                                                    \
    vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n);          \
                                                                    \
    return vpx_variance##n##x##m(temp0, n, b, b_stride, sse);       \
  }

sub_pixel_avg_varianceNxM(4, 4);
sub_pixel_avg_varianceNxM(4, 8);
sub_pixel_avg_varianceNxM(8, 4);
sub_pixel_avg_varianceNxM(8, 8);
sub_pixel_avg_varianceNxM(8, 16);
sub_pixel_avg_varianceNxM(16, 8);
sub_pixel_avg_varianceNxM(16, 16);
sub_pixel_avg_varianceNxM(16, 32);
sub_pixel_avg_varianceNxM(32, 16);
sub_pixel_avg_varianceNxM(32, 32);
sub_pixel_avg_varianceNxM(32, 64);
sub_pixel_avg_varianceNxM(64, 32);
sub_pixel_avg_varianceNxM(64, 64);