191037db265ecdd914a26e056cf69207b4f50924ehkuang/* 291037db265ecdd914a26e056cf69207b4f50924ehkuang * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 391037db265ecdd914a26e056cf69207b4f50924ehkuang * 491037db265ecdd914a26e056cf69207b4f50924ehkuang * Use of this source code is governed by a BSD-style license 591037db265ecdd914a26e056cf69207b4f50924ehkuang * that can be found in the LICENSE file in the root of the source 691037db265ecdd914a26e056cf69207b4f50924ehkuang * tree. An additional intellectual property rights grant can be found 791037db265ecdd914a26e056cf69207b4f50924ehkuang * in the file PATENTS. All contributing project authors may 891037db265ecdd914a26e056cf69207b4f50924ehkuang * be found in the AUTHORS file in the root of the source tree. 991037db265ecdd914a26e056cf69207b4f50924ehkuang */ 1091037db265ecdd914a26e056cf69207b4f50924ehkuang 1191037db265ecdd914a26e056cf69207b4f50924ehkuang#include "./vp9_rtcd.h" 1291037db265ecdd914a26e056cf69207b4f50924ehkuang#include "vp9/common/vp9_common.h" 133df0563f1b24dac6c0bd122fc922a48211269061hkuang#include "vpx_ports/mem.h" 1491037db265ecdd914a26e056cf69207b4f50924ehkuang 1591037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, 1691037db265ecdd914a26e056cf69207b4f50924ehkuang uint8_t *dst, ptrdiff_t dst_stride, 1791037db265ecdd914a26e056cf69207b4f50924ehkuang const int16_t *filter_x, int x_step_q4, 1891037db265ecdd914a26e056cf69207b4f50924ehkuang const int16_t *filter_y, int y_step_q4, 1991037db265ecdd914a26e056cf69207b4f50924ehkuang int w, int h) { 2091037db265ecdd914a26e056cf69207b4f50924ehkuang /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the 2191037db265ecdd914a26e056cf69207b4f50924ehkuang * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). 2291037db265ecdd914a26e056cf69207b4f50924ehkuang */ 233df0563f1b24dac6c0bd122fc922a48211269061hkuang DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); 2491037db265ecdd914a26e056cf69207b4f50924ehkuang 2591037db265ecdd914a26e056cf69207b4f50924ehkuang // Account for the vertical phase needing 3 lines prior and 4 lines post 2691037db265ecdd914a26e056cf69207b4f50924ehkuang int intermediate_height = h + 7; 2791037db265ecdd914a26e056cf69207b4f50924ehkuang 2891037db265ecdd914a26e056cf69207b4f50924ehkuang if (x_step_q4 != 16 || y_step_q4 != 16) 2991037db265ecdd914a26e056cf69207b4f50924ehkuang return vp9_convolve8_c(src, src_stride, 3091037db265ecdd914a26e056cf69207b4f50924ehkuang dst, dst_stride, 3191037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, 3291037db265ecdd914a26e056cf69207b4f50924ehkuang filter_y, y_step_q4, 3391037db265ecdd914a26e056cf69207b4f50924ehkuang w, h); 3491037db265ecdd914a26e056cf69207b4f50924ehkuang 3591037db265ecdd914a26e056cf69207b4f50924ehkuang /* Filter starting 3 lines back. The neon implementation will ignore the 3691037db265ecdd914a26e056cf69207b4f50924ehkuang * given height and filter a multiple of 4 lines. Since this goes in to 3791037db265ecdd914a26e056cf69207b4f50924ehkuang * the temp buffer which has lots of extra room and is subsequently discarded 3891037db265ecdd914a26e056cf69207b4f50924ehkuang * this is safe if somewhat less than ideal. 3991037db265ecdd914a26e056cf69207b4f50924ehkuang */ 4091037db265ecdd914a26e056cf69207b4f50924ehkuang vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, 4191037db265ecdd914a26e056cf69207b4f50924ehkuang temp, 64, 4291037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, filter_y, y_step_q4, 4391037db265ecdd914a26e056cf69207b4f50924ehkuang w, intermediate_height); 4491037db265ecdd914a26e056cf69207b4f50924ehkuang 4591037db265ecdd914a26e056cf69207b4f50924ehkuang /* Step into the temp buffer 3 lines to get the actual frame data */ 4691037db265ecdd914a26e056cf69207b4f50924ehkuang vp9_convolve8_vert_neon(temp + 64 * 3, 64, 4791037db265ecdd914a26e056cf69207b4f50924ehkuang dst, dst_stride, 4891037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, filter_y, y_step_q4, 4991037db265ecdd914a26e056cf69207b4f50924ehkuang w, h); 5091037db265ecdd914a26e056cf69207b4f50924ehkuang} 5191037db265ecdd914a26e056cf69207b4f50924ehkuang 5291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, 5391037db265ecdd914a26e056cf69207b4f50924ehkuang uint8_t *dst, ptrdiff_t dst_stride, 5491037db265ecdd914a26e056cf69207b4f50924ehkuang const int16_t *filter_x, int x_step_q4, 5591037db265ecdd914a26e056cf69207b4f50924ehkuang const int16_t *filter_y, int y_step_q4, 5691037db265ecdd914a26e056cf69207b4f50924ehkuang int w, int h) { 573df0563f1b24dac6c0bd122fc922a48211269061hkuang DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72); 5891037db265ecdd914a26e056cf69207b4f50924ehkuang int intermediate_height = h + 7; 5991037db265ecdd914a26e056cf69207b4f50924ehkuang 6091037db265ecdd914a26e056cf69207b4f50924ehkuang if (x_step_q4 != 16 || y_step_q4 != 16) 6191037db265ecdd914a26e056cf69207b4f50924ehkuang return vp9_convolve8_avg_c(src, src_stride, 6291037db265ecdd914a26e056cf69207b4f50924ehkuang dst, dst_stride, 6391037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, 6491037db265ecdd914a26e056cf69207b4f50924ehkuang filter_y, y_step_q4, 6591037db265ecdd914a26e056cf69207b4f50924ehkuang w, h); 6691037db265ecdd914a26e056cf69207b4f50924ehkuang 6791037db265ecdd914a26e056cf69207b4f50924ehkuang /* This implementation has the same issues as above. In addition, we only want 6891037db265ecdd914a26e056cf69207b4f50924ehkuang * to average the values after both passes. 6991037db265ecdd914a26e056cf69207b4f50924ehkuang */ 7091037db265ecdd914a26e056cf69207b4f50924ehkuang vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, 7191037db265ecdd914a26e056cf69207b4f50924ehkuang temp, 64, 7291037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, filter_y, y_step_q4, 7391037db265ecdd914a26e056cf69207b4f50924ehkuang w, intermediate_height); 7491037db265ecdd914a26e056cf69207b4f50924ehkuang vp9_convolve8_avg_vert_neon(temp + 64 * 3, 7591037db265ecdd914a26e056cf69207b4f50924ehkuang 64, dst, dst_stride, 7691037db265ecdd914a26e056cf69207b4f50924ehkuang filter_x, x_step_q4, filter_y, y_step_q4, 7791037db265ecdd914a26e056cf69207b4f50924ehkuang w, h); 7891037db265ecdd914a26e056cf69207b4f50924ehkuang} 79