/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"

#include "vpx_dsp/variance.h"

// 2-tap bilinear filter coefficients for the 8 subpel positions.
// Each pair sums to 128 (1 << FILTER_BITS), so the filtered output
// stays in 8-bit range after the rounding shift.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0, }, { 112, 16, }, { 96, 32, }, { 80, 48, },
  { 64, 64, }, { 48, 80, }, { 32, 96, }, { 16, 112, },
};

// Bilinearly filter an 8-pixel-wide block, one row of 8 pixels per
// iteration.  pixel_step selects the filter direction: 1 blends
// horizontally adjacent pixels, src_pixels_per_line blends vertically
// adjacent rows.  Rounds with vrshrn before narrowing back to 8 bits.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row;

  for (row = 0; row < output_height; ++row) {
    const uint8x8_t pix0 = vld1_u8(src_ptr);
    const uint8x8_t pix1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t acc = vmull_u8(pix0, tap0);
    acc = vmlal_u8(acc, pix1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(acc, FILTER_BITS));
    // Advance to the next row of input and output.
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

// Same as var_filter_block2d_bil_w8 but handles widths that are a
// multiple of 16, processing one 16-byte vector per inner iteration
// (split into low/high halves for the widening multiplies).
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t pix0 = vld1q_u8(src_ptr + col);
      const uint8x16_t pix1 = vld1q_u8(src_ptr + col + pixel_step);
      uint16x8_t acc_lo = vmull_u8(vget_low_u8(pix0), tap0);
      uint16x8_t acc_hi = vmull_u8(vget_high_u8(pix0), tap0);
      acc_lo = vmlal_u8(acc_lo, vget_low_u8(pix1), tap1);
      acc_hi = vmlal_u8(acc_hi, vget_high_u8(pix1), tap1);
      vst1q_u8(output_ptr + col,
               vcombine_u8(vrshrn_n_u16(acc_lo, FILTER_BITS),
                           vrshrn_n_u16(acc_hi, FILTER_BITS)));
    }
    // Advance to the next row of input and output.
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}

// Sub-pixel variance for an 8x8 block: filter horizontally into an
// intermediate 9x8 buffer (one extra row for the vertical tap), filter
// that vertically into an 8x8 buffer, then compute full-pel variance
// against dst.
unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
                                            int src_stride,
                                            int xoffset,
                                            int yoffset,
                                            const uint8_t *dst,
                                            int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            bilinear_filters[xoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
                            bilinear_filters[yoffset]);
  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}

// Sub-pixel variance for a 16x16 block; see the 8x8 version for the
// two-pass filtering scheme (17 intermediate rows here).
unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
                             bilinear_filters[yoffset]);
  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}

// Sub-pixel variance for a 32x32 block (33 intermediate rows).
unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
                             bilinear_filters[yoffset]);
  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}

// Sub-pixel variance for a 64x64 block (65 intermediate rows).
unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride,
                                              int xoffset,
                                              int yoffset,
                                              const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
                             bilinear_filters[yoffset]);
  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}