/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
117bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include <arm_neon.h>
127bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include <assert.h>
137bc9febe8749e98a3812a0dc4380ceae75c29450Johann
147bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vpx_config.h"
157bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vpx_dsp_rtcd.h"
167bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "vpx/vpx_integer.h"
177bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "vpx_dsp/arm/transpose_neon.h"
187bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "vpx_ports/mem.h"
197bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Reads four rows of four int16_t values. Row k is fetched from s + k * p
// (p is the row pitch in elements) and written to *s0, *s1, *s2, *s3 in that
// order.
static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0,
                            int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) {
  *s0 = vld1_s16(s + 0 * p);
  *s1 = vld1_s16(s + 1 * p);
  *s2 = vld1_s16(s + 2 * p);
  *s3 = vld1_s16(s + 3 * p);
}
307bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Reads four rows of eight uint16_t values. Row k is fetched from s + k * p
// (p is the row pitch in elements) and written to *s0, *s1, *s2, *s3 in that
// order.
static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0,
                            uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) {
  *s0 = vld1q_u16(s + 0 * p);
  *s1 = vld1q_u16(s + 1 * p);
  *s2 = vld1q_u16(s + 2 * p);
  *s3 = vld1q_u16(s + 3 * p);
}
417bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Reads eight rows of eight int16_t values. Row k is fetched from s + k * p
// (p is the row pitch in elements) and written to *s0 .. *s7 in that order.
static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0,
                            int16x8_t *s1, int16x8_t *s2, int16x8_t *s3,
                            int16x8_t *s4, int16x8_t *s5, int16x8_t *s6,
                            int16x8_t *s7) {
  *s0 = vld1q_s16(s + 0 * p);
  *s1 = vld1q_s16(s + 1 * p);
  *s2 = vld1q_s16(s + 2 * p);
  *s3 = vld1q_s16(s + 3 * p);
  *s4 = vld1q_s16(s + 4 * p);
  *s5 = vld1q_s16(s + 5 * p);
  *s6 = vld1q_s16(s + 6 * p);
  *s7 = vld1q_s16(s + 7 * p);
}
627bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Writes eight rows of eight uint16_t values. Vector sK is stored at
// s + K * p (p is the row pitch in elements), rows being written in
// ascending order.
static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0,
                             const uint16x8_t s1, const uint16x8_t s2,
                             const uint16x8_t s3, const uint16x8_t s4,
                             const uint16x8_t s5, const uint16x8_t s6,
                             const uint16x8_t s7) {
  vst1q_u16(s + 0 * p, s0);
  vst1q_u16(s + 1 * p, s1);
  vst1q_u16(s + 2 * p, s2);
  vst1q_u16(s + 3 * p, s3);
  vst1q_u16(s + 4 * p, s4);
  vst1q_u16(s + 5 * p, s5);
  vst1q_u16(s + 6 * p, s6);
  vst1q_u16(s + 7 * p, s7);
}
847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Eight-tap multiply-accumulate over four lanes.
//
// Returns, per lane, the 32-bit sum of sK[lane] * filters[K] for K = 0..7.
// No rounding, shifting or clamping is applied here; callers narrow the
// result themselves.
static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters) {
  const int16x4_t taps_0123 = vget_low_s16(filters);
  const int16x4_t taps_4567 = vget_high_s16(filters);
  // The first tap is a plain widening multiply, which equals accumulating
  // onto a zero vector.
  int32x4_t acc = vmull_lane_s16(s0, taps_0123, 0);

  acc = vmlal_lane_s16(acc, s1, taps_0123, 1);
  acc = vmlal_lane_s16(acc, s2, taps_0123, 2);
  acc = vmlal_lane_s16(acc, s3, taps_0123, 3);
  acc = vmlal_lane_s16(acc, s4, taps_4567, 0);
  acc = vmlal_lane_s16(acc, s5, taps_4567, 1);
  acc = vmlal_lane_s16(acc, s6, taps_4567, 2);
  acc = vmlal_lane_s16(acc, s7, taps_4567, 3);
  return acc;
}
1047bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Eight-tap filter over eight lanes, producing clamped 16-bit output.
//
// Each half of the inputs is run through convolve8_4() to obtain the 32-bit
// sums of sK[lane] * filters[K]. The sums are then narrowed with a rounding
// right shift by 7 and unsigned saturation, and finally limited to `max`
// lane by lane.
static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
                                     const int16x8_t s2, const int16x8_t s3,
                                     const int16x8_t s4, const int16x8_t s5,
                                     const int16x8_t s6, const int16x8_t s7,
                                     const int16x8_t filters,
                                     const uint16x8_t max) {
  const int32x4_t acc_lo = convolve8_4(
      vget_low_s16(s0), vget_low_s16(s1), vget_low_s16(s2), vget_low_s16(s3),
      vget_low_s16(s4), vget_low_s16(s5), vget_low_s16(s6), vget_low_s16(s7),
      filters);
  const int32x4_t acc_hi = convolve8_4(
      vget_high_s16(s0), vget_high_s16(s1), vget_high_s16(s2),
      vget_high_s16(s3), vget_high_s16(s4), vget_high_s16(s5),
      vget_high_s16(s6), vget_high_s16(s7), filters);
  const uint16x8_t narrowed =
      vcombine_u16(vqrshrun_n_s32(acc_lo, 7), vqrshrun_n_s32(acc_hi, 7));

  return vminq_u16(narrowed, max);
}
1377bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Horizontal 8-tap convolution of 16-bit samples using NEON.
//
// When x_step_q4 is not 16 the call is forwarded unchanged to
// vpx_highbd_convolve8_horiz_c(). Otherwise the eight coefficients at
// filter_x are applied along each row, the sums are narrowed with a rounding
// shift by 7 and every output sample is limited to (1 << bd) - 1.
//
// Three code paths are selected by block shape: h == 4, then w == 4, then
// everything else (processed in 8x8 tiles). Each path transposes the loaded
// rows before filtering and transposes the results back before storing; the
// transpose helpers come from transpose_neon.h.
//
// dst must be 4-byte aligned and dst_stride a multiple of 4 (asserted).
void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,
                                     uint16_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y,  // fallback only
                                     int y_step_q4,            // fallback only
                                     int w, int h, int bd) {
  int16x8_t filters;
  uint16x8_t max;
  int i;

  if (x_step_q4 != 16) {
    // Any step other than 16 is handled by the C implementation.
    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h, bd);
    return;
  }

  filters = vld1q_s16(filter_x);
  max = vdupq_n_u16((1 << bd) - 1);

  assert(!((intptr_t)dst & 3));
  assert(!(dst_stride & 3));

  // Start three samples to the left of the first output position.
  src -= 3;

  if (h == 4) {
    uint16x8_t r0, r1, r2, r3;
    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int32x4_t acc0, acc1, acc2, acc3;
    uint16x8_t out01, out23;

    for (i = 0; i < 4; ++i) __builtin_prefetch(src + i * src_stride);

    // Prime the window with the first seven input vectors.
    load_8x4(src, src_stride, &r0, &r1, &r2, &r3);
    transpose_u16_8x4(&r0, &r1, &r2, &r3);
    s0 = vreinterpret_s16_u16(vget_low_u16(r0));
    s1 = vreinterpret_s16_u16(vget_low_u16(r1));
    s2 = vreinterpret_s16_u16(vget_low_u16(r2));
    s3 = vreinterpret_s16_u16(vget_low_u16(r3));
    s4 = vreinterpret_s16_u16(vget_high_u16(r0));
    s5 = vreinterpret_s16_u16(vget_high_u16(r1));
    s6 = vreinterpret_s16_u16(vget_high_u16(r2));

    for (i = 0; i < 4; ++i) __builtin_prefetch(dst + i * dst_stride);
    src += 7;

    do {
      // Fetch the next four input vectors for this step.
      load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
      transpose_s16_4x4d(&s7, &s8, &s9, &s10);

      acc0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
      acc1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
      acc2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
      acc3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);

      // Round, narrow with unsigned saturation, then clamp to the bit depth.
      out01 = vminq_u16(
          vcombine_u16(vqrshrun_n_s32(acc0, 7), vqrshrun_n_s32(acc1, 7)), max);
      out23 = vminq_u16(
          vcombine_u16(vqrshrun_n_s32(acc2, 7), vqrshrun_n_s32(acc3, 7)), max);
      transpose_u16_4x4q(&out01, &out23);

      vst1_u16(dst + 0 * dst_stride, vget_low_u16(out01));
      vst1_u16(dst + 1 * dst_stride, vget_low_u16(out23));
      vst1_u16(dst + 2 * dst_stride, vget_high_u16(out01));
      vst1_u16(dst + 3 * dst_stride, vget_high_u16(out23));

      // Slide the window four positions to the right.
      s0 = s4;
      s1 = s5;
      s2 = s6;
      s3 = s7;
      s4 = s8;
      s5 = s9;
      s6 = s10;
      src += 4;
      dst += 4;
      w -= 4;
    } while (w > 0);
  } else if (w == 4) {
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int16x8_t x4, x5, x6, x7;  // Transposed but never filtered.
    uint16x8_t d0, d1, d2, d3;

    do {
      load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5,
               &s6, &s7);
      transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      // Second tile starts seven samples further along; it overwrites s7.
      load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
               &x4, &x5, &x6, &x7);
      src += 8 * src_stride;
      for (i = 0; i < 8; ++i) __builtin_prefetch(dst + i * dst_stride);
      transpose_s16_8x8(&s7, &s8, &s9, &s10, &x4, &x5, &x6, &x7);

      // src already points at the next band of eight rows.
      for (i = 0; i < 8; ++i) __builtin_prefetch(src + i * src_stride);

      d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
      d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
      d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
      d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);

      transpose_u16_8x4(&d0, &d1, &d2, &d3);
      vst1_u16(dst + 0 * dst_stride, vget_low_u16(d0));
      vst1_u16(dst + 1 * dst_stride, vget_low_u16(d1));
      vst1_u16(dst + 2 * dst_stride, vget_low_u16(d2));
      vst1_u16(dst + 3 * dst_stride, vget_low_u16(d3));
      vst1_u16(dst + 4 * dst_stride, vget_high_u16(d0));
      vst1_u16(dst + 5 * dst_stride, vget_high_u16(d1));
      vst1_u16(dst + 6 * dst_stride, vget_high_u16(d2));
      vst1_u16(dst + 7 * dst_stride, vget_high_u16(d3));
      dst += 8 * dst_stride;
      h -= 8;
    } while (h > 0);
  } else {
    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
    int16x8_t s11, s12, s13, s14;
    uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
    const uint16_t *src_col;
    uint16_t *dst_col;
    int cols_left;

    do {
      for (i = 0; i < 8; ++i) __builtin_prefetch(src + i * src_stride);

      // Prime the window for this band of eight rows.
      load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5,
               &s6, &s7);
      transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

      cols_left = w;
      src_col = src + 7;
      dst_col = dst;
      for (i = 0; i < 8; ++i) __builtin_prefetch(dst + i * dst_stride);

      do {
        load_8x8((const int16_t *)src_col, src_stride, &s7, &s8, &s9, &s10,
                 &s11, &s12, &s13, &s14);
        transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);

        d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
        d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
        d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
        d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
        d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
        d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
        d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
        d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);

        transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
        store_8x8(dst_col, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

        // Slide the window eight positions to the right.
        s0 = s8;
        s1 = s9;
        s2 = s10;
        s3 = s11;
        s4 = s12;
        s5 = s13;
        s6 = s14;
        src_col += 8;
        dst_col += 8;
        cols_left -= 8;
      } while (cols_left > 0);

      src += 8 * src_stride;
      dst += 8 * dst_stride;
      h -= 8;
    } while (h > 0);
  }
}
3367bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3378b92989c89bec8632aa47dc58dc162f199d62edcJames Zernvoid vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,
3388b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                                         ptrdiff_t src_stride, uint16_t *dst,
3397bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         ptrdiff_t dst_stride,
3407bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         const int16_t *filter_x, int x_step_q4,
3417bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         const int16_t *filter_y,  // unused
3427bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         int y_step_q4,            // unused
3437bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                         int w, int h, int bd) {
3447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (x_step_q4 != 16) {
3458b92989c89bec8632aa47dc58dc162f199d62edcJames Zern    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
3468b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                                     x_step_q4, filter_y, y_step_q4, w, h, bd);
3477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  } else {
3487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const int16x8_t filters = vld1q_s16(filter_x);
3497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
3507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    uint16x8_t t0, t1, t2, t3;
3517bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!((intptr_t)dst & 3));
3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!(dst_stride & 3));
3547bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3557bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src -= 3;
3567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3577bc9febe8749e98a3812a0dc4380ceae75c29450Johann    if (h == 4) {
3587bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
3597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int32x4_t d0, d1, d2, d3;
3607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d01, d23, t01, t23;
3617bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src + 0 * src_stride);
3637bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src + 1 * src_stride);
3647bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src + 2 * src_stride);
3657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src + 3 * src_stride);
3667bc9febe8749e98a3812a0dc4380ceae75c29450Johann      load_8x4(src, src_stride, &t0, &t1, &t2, &t3);
3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann      transpose_u16_8x4(&t0, &t1, &t2, &t3);
3687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s0 = vreinterpret_s16_u16(vget_low_u16(t0));
3697bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s1 = vreinterpret_s16_u16(vget_low_u16(t1));
3707bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s2 = vreinterpret_s16_u16(vget_low_u16(t2));
3717bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s3 = vreinterpret_s16_u16(vget_low_u16(t3));
3727bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s4 = vreinterpret_s16_u16(vget_high_u16(t0));
3737bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s5 = vreinterpret_s16_u16(vget_high_u16(t1));
3747bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s6 = vreinterpret_s16_u16(vget_high_u16(t2));
3757bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(dst + 0 * dst_stride);
3767bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(dst + 1 * dst_stride);
3777bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(dst + 2 * dst_stride);
3787bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(dst + 3 * dst_stride);
3797bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += 7;
3807bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3817bc9febe8749e98a3812a0dc4380ceae75c29450Johann      do {
3827bc9febe8749e98a3812a0dc4380ceae75c29450Johann        load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10);
3837bc9febe8749e98a3812a0dc4380ceae75c29450Johann        transpose_s16_4x4d(&s7, &s8, &s9, &s10);
3847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
3867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
3877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
3887bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
3897bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
3917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
3927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t01 = vminq_u16(t01, max);
3937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t23 = vminq_u16(t23, max);
3947bc9febe8749e98a3812a0dc4380ceae75c29450Johann        transpose_u16_4x4q(&t01, &t23);
3957bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
3977bc9febe8749e98a3812a0dc4380ceae75c29450Johann                           vld1_u16(dst + 2 * dst_stride));
3987bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
3997bc9febe8749e98a3812a0dc4380ceae75c29450Johann                           vld1_u16(dst + 3 * dst_stride));
4007bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vrhaddq_u16(d01, t01);
4017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vrhaddq_u16(d23, t23);
4027bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01));
4047bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23));
4057bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01));
4067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23));
4077bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s0 = s4;
4097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s1 = s5;
4107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s2 = s6;
4117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s3 = s7;
4127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s4 = s8;
4137bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s5 = s9;
4147bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s6 = s10;
4157bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += 4;
4167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += 4;
4177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        w -= 4;
4187bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } while (w > 0);
4197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    } else {
4207bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x8_t t4, t5, t6, t7;
4217bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
4227bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
4237bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4247bc9febe8749e98a3812a0dc4380ceae75c29450Johann      if (w == 4) {
4257bc9febe8749e98a3812a0dc4380ceae75c29450Johann        do {
4267bc9febe8749e98a3812a0dc4380ceae75c29450Johann          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
4277bc9febe8749e98a3812a0dc4380ceae75c29450Johann                   &s5, &s6, &s7);
4287bc9febe8749e98a3812a0dc4380ceae75c29450Johann          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
4297bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4307bc9febe8749e98a3812a0dc4380ceae75c29450Johann          load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10,
4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann                   &t4, &t5, &t6, &t7);
4327bc9febe8749e98a3812a0dc4380ceae75c29450Johann          src += 8 * src_stride;
4337bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 0 * dst_stride);
4347bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 1 * dst_stride);
4357bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 2 * dst_stride);
4367bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 3 * dst_stride);
4377bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 4 * dst_stride);
4387bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 5 * dst_stride);
4397bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 6 * dst_stride);
4407bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 7 * dst_stride);
4417bc9febe8749e98a3812a0dc4380ceae75c29450Johann          transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7);
4427bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4437bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 0 * src_stride);
4447bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 1 * src_stride);
4457bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 2 * src_stride);
4467bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 3 * src_stride);
4477bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 4 * src_stride);
4487bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 5 * src_stride);
4497bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 6 * src_stride);
4507bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 7 * src_stride);
4517bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
4527bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
4537bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
4547bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
4557bc9febe8749e98a3812a0dc4380ceae75c29450Johann          transpose_u16_8x4(&t0, &t1, &t2, &t3);
4567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4577bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
4587bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            vld1_u16(dst + 4 * dst_stride));
4597bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride),
4607bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            vld1_u16(dst + 5 * dst_stride));
4617bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
4627bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            vld1_u16(dst + 6 * dst_stride));
4637bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride),
4647bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            vld1_u16(dst + 7 * dst_stride));
4657bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d0 = vrhaddq_u16(d0, t0);
4667bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d1 = vrhaddq_u16(d1, t1);
4677bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d2 = vrhaddq_u16(d2, t2);
4687bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d3 = vrhaddq_u16(d3, t3);
4697bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4707bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_low_u16(d0));
4717bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4727bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_low_u16(d1));
4737bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4747bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_low_u16(d2));
4757bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4767bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_low_u16(d3));
4777bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_high_u16(d0));
4797bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4807bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_high_u16(d1));
4817bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_high_u16(d2));
4837bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4847bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1_u16(dst, vget_high_u16(d3));
4857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += dst_stride;
4867bc9febe8749e98a3812a0dc4380ceae75c29450Johann          h -= 8;
4877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        } while (h > 0);
4887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } else {
4897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        int width;
4907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        const uint16_t *s;
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        uint16_t *d;
4927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        int16x8_t s11, s12, s13, s14;
4937bc9febe8749e98a3812a0dc4380ceae75c29450Johann        uint16x8_t d4, d5, d6, d7;
4947bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4957bc9febe8749e98a3812a0dc4380ceae75c29450Johann        do {
4967bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 0 * src_stride);
4977bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 1 * src_stride);
4987bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 2 * src_stride);
4997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 3 * src_stride);
5007bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 4 * src_stride);
5017bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 5 * src_stride);
5027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 6 * src_stride);
5037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(src + 7 * src_stride);
5047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4,
5057bc9febe8749e98a3812a0dc4380ceae75c29450Johann                   &s5, &s6, &s7);
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann          transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
5077bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5087bc9febe8749e98a3812a0dc4380ceae75c29450Johann          width = w;
5097bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s = src + 7;
5107bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d = dst;
5117bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 0 * dst_stride);
5127bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 1 * dst_stride);
5137bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 2 * dst_stride);
5147bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 3 * dst_stride);
5157bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 4 * dst_stride);
5167bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 5 * dst_stride);
5177bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 6 * dst_stride);
5187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(dst + 7 * dst_stride);
5197bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5207bc9febe8749e98a3812a0dc4380ceae75c29450Johann          do {
5217bc9febe8749e98a3812a0dc4380ceae75c29450Johann            load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11,
5227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                     &s12, &s13, &s14);
5237bc9febe8749e98a3812a0dc4380ceae75c29450Johann            transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14);
5247bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5257bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
5267bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
5277bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
5287bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
5297bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max);
5307bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max);
5317bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max);
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max);
5337bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann            transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
5357bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5367bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d0 = vrhaddq_u16(d0, vld1q_u16(d + 0 * dst_stride));
5377bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride));
5387bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride));
5397bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride));
5407bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride));
5417bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride));
5427bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride));
5437bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride));
5447bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann            store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s0 = s8;
5487bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s1 = s9;
5497bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s2 = s10;
5507bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s3 = s11;
5517bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s4 = s12;
5527bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s5 = s13;
5537bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s6 = s14;
5547bc9febe8749e98a3812a0dc4380ceae75c29450Johann            s += 8;
5557bc9febe8749e98a3812a0dc4380ceae75c29450Johann            d += 8;
5567bc9febe8749e98a3812a0dc4380ceae75c29450Johann            width -= 8;
5577bc9febe8749e98a3812a0dc4380ceae75c29450Johann          } while (width > 0);
5587bc9febe8749e98a3812a0dc4380ceae75c29450Johann          src += 8 * src_stride;
5597bc9febe8749e98a3812a0dc4380ceae75c29450Johann          dst += 8 * dst_stride;
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann          h -= 8;
5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        } while (h > 0);
5627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      }
5637bc9febe8749e98a3812a0dc4380ceae75c29450Johann    }
5647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5678b92989c89bec8632aa47dc58dc162f199d62edcJames Zernvoid vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,
5688b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                                    uint16_t *dst, ptrdiff_t dst_stride,
5697bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    const int16_t *filter_x,  // unused
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    int x_step_q4,            // unused
5717bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    const int16_t *filter_y, int y_step_q4,
5727bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                    int w, int h, int bd) {
5737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (y_step_q4 != 16) {
5748b92989c89bec8632aa47dc58dc162f199d62edcJames Zern    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                x_step_q4, filter_y, y_step_q4, w, h, bd);
5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  } else {
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const int16x8_t filters = vld1q_s16(filter_y);
5787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
5797bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!((intptr_t)dst & 3));
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!(dst_stride & 3));
5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src -= 3 * src_stride;
5847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    if (w == 4) {
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int32x4_t d0, d1, d2, d3;
5887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d01, d23;
5897bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5907bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s0 = vreinterpret_s16_u16(vld1_u16(src));
5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s1 = vreinterpret_s16_u16(vld1_u16(src));
5937bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
5947bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s2 = vreinterpret_s16_u16(vld1_u16(src));
5957bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s3 = vreinterpret_s16_u16(vld1_u16(src));
5977bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
5987bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s4 = vreinterpret_s16_u16(vld1_u16(src));
5997bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
6007bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s5 = vreinterpret_s16_u16(vld1_u16(src));
6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s6 = vreinterpret_s16_u16(vld1_u16(src));
6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
6047bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6057bc9febe8749e98a3812a0dc4380ceae75c29450Johann      do {
6067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s7 = vreinterpret_s16_u16(vld1_u16(src));
6077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
6087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s8 = vreinterpret_s16_u16(vld1_u16(src));
6097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
6107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s9 = vreinterpret_s16_u16(vld1_u16(src));
6117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
6127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s10 = vreinterpret_s16_u16(vld1_u16(src));
6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
6147bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6157bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 0 * dst_stride);
6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 1 * dst_stride);
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 2 * dst_stride);
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 3 * dst_stride);
6197bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 0 * src_stride);
6207bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 1 * src_stride);
6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 2 * src_stride);
6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 3 * src_stride);
6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
6247bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
6257bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
6267bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
6277bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
6297bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
6307bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vminq_u16(d01, max);
6317bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vminq_u16(d23, max);
6327bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_low_u16(d01));
6337bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
6347bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_high_u16(d01));
6357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
6367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_low_u16(d23));
6377bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
6387bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_high_u16(d23));
6397bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
6407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6417bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s0 = s4;
6427bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s1 = s5;
6437bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s2 = s6;
6447bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s3 = s7;
6457bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s4 = s8;
6467bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s5 = s9;
6477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s6 = s10;
6487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        h -= 4;
6497bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } while (h > 0);
6507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    } else {
6517bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int height;
6527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      const uint16_t *s;
6537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16_t *d;
6547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
6557bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d0, d1, d2, d3;
6567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6577bc9febe8749e98a3812a0dc4380ceae75c29450Johann      do {
6587bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 0 * src_stride);
6597bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 1 * src_stride);
6607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 2 * src_stride);
6617bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 3 * src_stride);
6627bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 4 * src_stride);
6637bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 5 * src_stride);
6647bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 6 * src_stride);
6657bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s = src;
6667bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
6677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6687bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
6697bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6707bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
6717bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6727bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
6737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
6757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
6777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6787bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
6797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
6807bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d = dst;
6817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        height = h;
6827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6837bc9febe8749e98a3812a0dc4380ceae75c29450Johann        do {
6847bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
6857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
6867bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
6877bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
6887bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
6897bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
6907bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
6917bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
6927bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6937bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 0 * dst_stride);
6947bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 1 * dst_stride);
6957bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 2 * dst_stride);
6967bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 3 * dst_stride);
6977bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 0 * src_stride);
6987bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 1 * src_stride);
6997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 2 * src_stride);
7007bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 3 * src_stride);
7017bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
7027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
7037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
7047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
7057bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7067bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d0);
7077bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
7087bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d1);
7097bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
7107bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d2);
7117bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
7127bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d3);
7137bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
7147bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7157bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s0 = s4;
7167bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s1 = s5;
7177bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s2 = s6;
7187bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s3 = s7;
7197bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s4 = s8;
7207bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s5 = s9;
7217bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s6 = s10;
7227bc9febe8749e98a3812a0dc4380ceae75c29450Johann          height -= 4;
7237bc9febe8749e98a3812a0dc4380ceae75c29450Johann        } while (height > 0);
7247bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += 8;
7257bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += 8;
7267bc9febe8749e98a3812a0dc4380ceae75c29450Johann        w -= 8;
7277bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } while (w > 0);
7287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    }
7297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
7307bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
7317bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7328b92989c89bec8632aa47dc58dc162f199d62edcJames Zernvoid vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,
7338b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                                        ptrdiff_t src_stride, uint16_t *dst,
7347bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        ptrdiff_t dst_stride,
7357bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        const int16_t *filter_x,  // unused
7367bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        int x_step_q4,            // unused
7377bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        const int16_t *filter_y, int y_step_q4,
7387bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                        int w, int h, int bd) {
7397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (y_step_q4 != 16) {
7408b92989c89bec8632aa47dc58dc162f199d62edcJames Zern    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
7418b92989c89bec8632aa47dc58dc162f199d62edcJames Zern                                    x_step_q4, filter_y, y_step_q4, w, h, bd);
7427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  } else {
7437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const int16x8_t filters = vld1q_s16(filter_y);
7447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
7457bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!((intptr_t)dst & 3));
7477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    assert(!(dst_stride & 3));
7487bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src -= 3 * src_stride;
7507bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    if (w == 4) {
7527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
7537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int32x4_t d0, d1, d2, d3;
7547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d01, d23, t01, t23;
7557bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7567bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s0 = vreinterpret_s16_u16(vld1_u16(src));
7577bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7587bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s1 = vreinterpret_s16_u16(vld1_u16(src));
7597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s2 = vreinterpret_s16_u16(vld1_u16(src));
7617bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s3 = vreinterpret_s16_u16(vld1_u16(src));
7637bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7647bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s4 = vreinterpret_s16_u16(vld1_u16(src));
7657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7667bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s5 = vreinterpret_s16_u16(vld1_u16(src));
7677bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      s6 = vreinterpret_s16_u16(vld1_u16(src));
7697bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_stride;
7707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7717bc9febe8749e98a3812a0dc4380ceae75c29450Johann      do {
7727bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s7 = vreinterpret_s16_u16(vld1_u16(src));
7737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
7747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s8 = vreinterpret_s16_u16(vld1_u16(src));
7757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
7767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s9 = vreinterpret_s16_u16(vld1_u16(src));
7777bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
7787bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s10 = vreinterpret_s16_u16(vld1_u16(src));
7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_stride;
7807bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7817bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 0 * dst_stride);
7827bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 1 * dst_stride);
7837bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 2 * dst_stride);
7847bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(dst + 3 * dst_stride);
7857bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 0 * src_stride);
7867bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 1 * src_stride);
7877bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 2 * src_stride);
7887bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 3 * src_stride);
7897bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
7907bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
7917bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
7927bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
7937bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7947bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7));
7957bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7));
7967bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t01 = vminq_u16(t01, max);
7977bc9febe8749e98a3812a0dc4380ceae75c29450Johann        t23 = vminq_u16(t23, max);
7987bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7997bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride),
8007bc9febe8749e98a3812a0dc4380ceae75c29450Johann                           vld1_u16(dst + 1 * dst_stride));
8017bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride),
8027bc9febe8749e98a3812a0dc4380ceae75c29450Johann                           vld1_u16(dst + 3 * dst_stride));
8037bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d01 = vrhaddq_u16(d01, t01);
8047bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23 = vrhaddq_u16(d23, t23);
8057bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_low_u16(d01));
8077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
8087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_high_u16(d01));
8097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
8107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_low_u16(d23));
8117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
8127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u16(dst, vget_high_u16(d23));
8137bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_stride;
8147bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8157bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s0 = s4;
8167bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s1 = s5;
8177bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s2 = s6;
8187bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s3 = s7;
8197bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s4 = s8;
8207bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s5 = s9;
8217bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s6 = s10;
8227bc9febe8749e98a3812a0dc4380ceae75c29450Johann        h -= 4;
8237bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } while (h > 0);
8247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    } else {
8257bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int height;
8267bc9febe8749e98a3812a0dc4380ceae75c29450Johann      const uint16_t *s;
8277bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16_t *d;
8287bc9febe8749e98a3812a0dc4380ceae75c29450Johann      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
8297bc9febe8749e98a3812a0dc4380ceae75c29450Johann      uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3;
8307bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8317bc9febe8749e98a3812a0dc4380ceae75c29450Johann      do {
8327bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 0 * src_stride);
8337bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 1 * src_stride);
8347bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 2 * src_stride);
8357bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 3 * src_stride);
8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 4 * src_stride);
8377bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 5 * src_stride);
8387bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __builtin_prefetch(src + 6 * src_stride);
8397bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s = src;
8407bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s0 = vreinterpretq_s16_u16(vld1q_u16(s));
8417bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8427bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s1 = vreinterpretq_s16_u16(vld1q_u16(s));
8437bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8447bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s2 = vreinterpretq_s16_u16(vld1q_u16(s));
8457bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8467bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s3 = vreinterpretq_s16_u16(vld1q_u16(s));
8477bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s4 = vreinterpretq_s16_u16(vld1q_u16(s));
8497bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8507bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s5 = vreinterpretq_s16_u16(vld1q_u16(s));
8517bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8527bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s6 = vreinterpretq_s16_u16(vld1q_u16(s));
8537bc9febe8749e98a3812a0dc4380ceae75c29450Johann        s += src_stride;
8547bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d = dst;
8557bc9febe8749e98a3812a0dc4380ceae75c29450Johann        height = h;
8567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8577bc9febe8749e98a3812a0dc4380ceae75c29450Johann        do {
8587bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s7 = vreinterpretq_s16_u16(vld1q_u16(s));
8597bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
8607bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s8 = vreinterpretq_s16_u16(vld1q_u16(s));
8617bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
8627bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s9 = vreinterpretq_s16_u16(vld1q_u16(s));
8637bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
8647bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s10 = vreinterpretq_s16_u16(vld1q_u16(s));
8657bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s += src_stride;
8667bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8677bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 0 * dst_stride);
8687bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 1 * dst_stride);
8697bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 2 * dst_stride);
8707bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(d + 3 * dst_stride);
8717bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 0 * src_stride);
8727bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 1 * src_stride);
8737bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 2 * src_stride);
8747bc9febe8749e98a3812a0dc4380ceae75c29450Johann          __builtin_prefetch(s + 3 * src_stride);
8757bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max);
8767bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max);
8777bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max);
8787bc9febe8749e98a3812a0dc4380ceae75c29450Johann          t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max);
8797bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8807bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d0 = vld1q_u16(d + 0 * dst_stride);
8817bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d1 = vld1q_u16(d + 1 * dst_stride);
8827bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d2 = vld1q_u16(d + 2 * dst_stride);
8837bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d3 = vld1q_u16(d + 3 * dst_stride);
8847bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d0 = vrhaddq_u16(d0, t0);
8857bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d1 = vrhaddq_u16(d1, t1);
8867bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d2 = vrhaddq_u16(d2, t2);
8877bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d3 = vrhaddq_u16(d3, t3);
8887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8897bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d0);
8907bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
8917bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d1);
8927bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
8937bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d2);
8947bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
8957bc9febe8749e98a3812a0dc4380ceae75c29450Johann          vst1q_u16(d, d3);
8967bc9febe8749e98a3812a0dc4380ceae75c29450Johann          d += dst_stride;
8977bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8987bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s0 = s4;
8997bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s1 = s5;
9007bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s2 = s6;
9017bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s3 = s7;
9027bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s4 = s8;
9037bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s5 = s9;
9047bc9febe8749e98a3812a0dc4380ceae75c29450Johann          s6 = s10;
9057bc9febe8749e98a3812a0dc4380ceae75c29450Johann          height -= 4;
9067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        } while (height > 0);
9077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += 8;
9087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += 8;
9097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        w -= 8;
9107bc9febe8749e98a3812a0dc4380ceae75c29450Johann      } while (w > 0);
9117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    }
9127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
9137bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
914