/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
1091037db265ecdd914a26e056cf69207b4f50924ehkuang
1191037db265ecdd914a26e056cf69207b4f50924ehkuang#include "./vp9_rtcd.h"
1291037db265ecdd914a26e056cf69207b4f50924ehkuang#include "vp9/common/vp9_common.h"
133df0563f1b24dac6c0bd122fc922a48211269061hkuang#include "vpx_ports/mem.h"
1491037db265ecdd914a26e056cf69207b4f50924ehkuang
1591037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
1691037db265ecdd914a26e056cf69207b4f50924ehkuang                        uint8_t *dst, ptrdiff_t dst_stride,
1791037db265ecdd914a26e056cf69207b4f50924ehkuang                        const int16_t *filter_x, int x_step_q4,
1891037db265ecdd914a26e056cf69207b4f50924ehkuang                        const int16_t *filter_y, int y_step_q4,
1991037db265ecdd914a26e056cf69207b4f50924ehkuang                        int w, int h) {
2091037db265ecdd914a26e056cf69207b4f50924ehkuang  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
2191037db265ecdd914a26e056cf69207b4f50924ehkuang   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
2291037db265ecdd914a26e056cf69207b4f50924ehkuang   */
233df0563f1b24dac6c0bd122fc922a48211269061hkuang  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
2491037db265ecdd914a26e056cf69207b4f50924ehkuang
2591037db265ecdd914a26e056cf69207b4f50924ehkuang  // Account for the vertical phase needing 3 lines prior and 4 lines post
2691037db265ecdd914a26e056cf69207b4f50924ehkuang  int intermediate_height = h + 7;
2791037db265ecdd914a26e056cf69207b4f50924ehkuang
2891037db265ecdd914a26e056cf69207b4f50924ehkuang  if (x_step_q4 != 16 || y_step_q4 != 16)
2991037db265ecdd914a26e056cf69207b4f50924ehkuang    return vp9_convolve8_c(src, src_stride,
3091037db265ecdd914a26e056cf69207b4f50924ehkuang                           dst, dst_stride,
3191037db265ecdd914a26e056cf69207b4f50924ehkuang                           filter_x, x_step_q4,
3291037db265ecdd914a26e056cf69207b4f50924ehkuang                           filter_y, y_step_q4,
3391037db265ecdd914a26e056cf69207b4f50924ehkuang                           w, h);
3491037db265ecdd914a26e056cf69207b4f50924ehkuang
3591037db265ecdd914a26e056cf69207b4f50924ehkuang  /* Filter starting 3 lines back. The neon implementation will ignore the
3691037db265ecdd914a26e056cf69207b4f50924ehkuang   * given height and filter a multiple of 4 lines. Since this goes in to
3791037db265ecdd914a26e056cf69207b4f50924ehkuang   * the temp buffer which has lots of extra room and is subsequently discarded
3891037db265ecdd914a26e056cf69207b4f50924ehkuang   * this is safe if somewhat less than ideal.
3991037db265ecdd914a26e056cf69207b4f50924ehkuang   */
4091037db265ecdd914a26e056cf69207b4f50924ehkuang  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
4191037db265ecdd914a26e056cf69207b4f50924ehkuang                           temp, 64,
4291037db265ecdd914a26e056cf69207b4f50924ehkuang                           filter_x, x_step_q4, filter_y, y_step_q4,
4391037db265ecdd914a26e056cf69207b4f50924ehkuang                           w, intermediate_height);
4491037db265ecdd914a26e056cf69207b4f50924ehkuang
4591037db265ecdd914a26e056cf69207b4f50924ehkuang  /* Step into the temp buffer 3 lines to get the actual frame data */
4691037db265ecdd914a26e056cf69207b4f50924ehkuang  vp9_convolve8_vert_neon(temp + 64 * 3, 64,
4791037db265ecdd914a26e056cf69207b4f50924ehkuang                          dst, dst_stride,
4891037db265ecdd914a26e056cf69207b4f50924ehkuang                          filter_x, x_step_q4, filter_y, y_step_q4,
4991037db265ecdd914a26e056cf69207b4f50924ehkuang                          w, h);
5091037db265ecdd914a26e056cf69207b4f50924ehkuang}
5191037db265ecdd914a26e056cf69207b4f50924ehkuang
5291037db265ecdd914a26e056cf69207b4f50924ehkuangvoid vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
5391037db265ecdd914a26e056cf69207b4f50924ehkuang                            uint8_t *dst, ptrdiff_t dst_stride,
5491037db265ecdd914a26e056cf69207b4f50924ehkuang                            const int16_t *filter_x, int x_step_q4,
5591037db265ecdd914a26e056cf69207b4f50924ehkuang                            const int16_t *filter_y, int y_step_q4,
5691037db265ecdd914a26e056cf69207b4f50924ehkuang                            int w, int h) {
573df0563f1b24dac6c0bd122fc922a48211269061hkuang  DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
5891037db265ecdd914a26e056cf69207b4f50924ehkuang  int intermediate_height = h + 7;
5991037db265ecdd914a26e056cf69207b4f50924ehkuang
6091037db265ecdd914a26e056cf69207b4f50924ehkuang  if (x_step_q4 != 16 || y_step_q4 != 16)
6191037db265ecdd914a26e056cf69207b4f50924ehkuang    return vp9_convolve8_avg_c(src, src_stride,
6291037db265ecdd914a26e056cf69207b4f50924ehkuang                               dst, dst_stride,
6391037db265ecdd914a26e056cf69207b4f50924ehkuang                               filter_x, x_step_q4,
6491037db265ecdd914a26e056cf69207b4f50924ehkuang                               filter_y, y_step_q4,
6591037db265ecdd914a26e056cf69207b4f50924ehkuang                               w, h);
6691037db265ecdd914a26e056cf69207b4f50924ehkuang
6791037db265ecdd914a26e056cf69207b4f50924ehkuang  /* This implementation has the same issues as above. In addition, we only want
6891037db265ecdd914a26e056cf69207b4f50924ehkuang   * to average the values after both passes.
6991037db265ecdd914a26e056cf69207b4f50924ehkuang   */
7091037db265ecdd914a26e056cf69207b4f50924ehkuang  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
7191037db265ecdd914a26e056cf69207b4f50924ehkuang                           temp, 64,
7291037db265ecdd914a26e056cf69207b4f50924ehkuang                           filter_x, x_step_q4, filter_y, y_step_q4,
7391037db265ecdd914a26e056cf69207b4f50924ehkuang                           w, intermediate_height);
7491037db265ecdd914a26e056cf69207b4f50924ehkuang  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
7591037db265ecdd914a26e056cf69207b4f50924ehkuang                              64, dst, dst_stride,
7691037db265ecdd914a26e056cf69207b4f50924ehkuang                              filter_x, x_step_q4, filter_y, y_step_q4,
7791037db265ecdd914a26e056cf69207b4f50924ehkuang                              w, h);
7891037db265ecdd914a26e056cf69207b4f50924ehkuang}
79