1ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian/*
2ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *
4ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  Use of this source code is governed by a BSD-style license
5ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  that can be found in the LICENSE file in the root of the source
6ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  tree. An additional intellectual property rights grant can be found
7ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  in the file PATENTS.  All contributing project authors may
8ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  be found in the AUTHORS file in the root of the source tree.
9ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <arm_neon.h>
127bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include <string.h>
137bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vpx_config.h"
140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/arm/mem_neon.h"
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_ports/mem.h"
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
/* Signed six-tap sub-pixel interpolation kernels, indexed by offset.
 * Each row holds the six taps followed by two zeros of padding, and the taps
 * of every row add up to 128. */
static const int8_t vp8_sub_pel_filters[8][8] = {
  /* [0] The 1/8 pel positions are just as per alpha -0.5 bicubic. */
  { 0, 0, 128, 0, 0, 0, 0, 0 },
  /* [1] */
  { 0, -6, 123, 12, -1, 0, 0, 0 },
  /* [2] New 1/4 pel 6 tap filter. */
  { 2, -11, 108, 36, -8, 1, 0, 0 },
  /* [3] */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  /* [4] New 1/2 pel 6 tap filter. */
  { 3, -16, 77, 77, -16, 3, 0, 0 },
  /* [5] */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  /* [6] New 1/4 pel 6 tap filter. */
  { 1, -8, 36, 108, -11, 2, 0, 0 },
  /* [7] */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};
27ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Magnitudes of the taps in vp8/common/filter.c:vp8_sub_pel_filters, i.e. that
// table with abs() applied to every entry.
// Taps 0, 2, 3 and 5 are never negative in the signed table, while taps 1 and
// 4 are always zero or negative. The filtering code restores the sign by
// choosing a multiply-add or a multiply-subtract for each tap. The other
// functions will be updated to use this table later.
// Every row is padded to 8 entries so that it can be loaded into a 64 bit
// neon register in one go.
static const uint8_t abs_filters[8][8] = {
  { 0, 0, 128, 0, 0, 0, 0, 0 },    // [0]
  { 0, 6, 123, 12, 1, 0, 0, 0 },   // [1]
  { 2, 11, 108, 36, 8, 1, 0, 0 },  // [2]
  { 0, 9, 93, 50, 6, 0, 0, 0 },    // [3]
  { 3, 16, 77, 77, 16, 3, 0, 0 },  // [4]
  { 0, 6, 50, 93, 9, 0, 0, 0 },    // [5]
  { 1, 8, 36, 108, 11, 2, 0, 0 },  // [6]
  { 0, 1, 12, 123, 6, 0, 0, 0 },   // [7]
};
41ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Loads 8 bytes from |a|, treats them as a single 64 bit value and shifts that
// value left by 32 bits. The upper half of the loaded value is discarded and
// the lower half of the result is zero. yonly4x4() uses this so that a
// following vext_u8(..., 4) can join the surviving half with another row.
static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  const uint64x1_t packed = vreinterpret_u64_u8(vld1_u8(a));
  return vreinterpret_u8_u64(vshl_n_u64(packed, 32));
}
45ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Multiply-accumulate step for a tap that is added to the running sum.
// For each of |a| and |b|, the low 32 bits of the two 64 bit halves are
// gathered into one 8 lane vector (the first output of the zip). That vector
// is multiplied lane-wise by |filter| and the widened products are added:
// the products from |a| go into |*c| and the products from |b| into |*d|.
static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2_t a_lo = vreinterpret_u32_u8(vget_low_u8(a));
  const uint32x2_t a_hi = vreinterpret_u32_u8(vget_high_u8(a));
  const uint32x2_t b_lo = vreinterpret_u32_u8(vget_low_u8(b));
  const uint32x2_t b_hi = vreinterpret_u32_u8(vget_high_u8(b));
  // Only val[0] of each zip is needed; val[1] is ignored.
  const uint32x2x2_t a_zipped = vzip_u32(a_lo, a_hi);
  const uint32x2x2_t b_zipped = vzip_u32(b_lo, b_hi);
  const uint8x8_t a_pixels = vreinterpret_u8_u32(a_zipped.val[0]);
  const uint8x8_t b_pixels = vreinterpret_u8_u32(b_zipped.val[0]);

  *c = vmlal_u8(*c, a_pixels, filter);
  *d = vmlal_u8(*d, b_pixels, filter);
}
56ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
// Multiply-subtract step for a tap whose magnitude is taken away from the
// running sum.
// For each of |a| and |b|, the low 32 bits of the two 64 bit halves are
// gathered into one 8 lane vector (the first output of the zip). That vector
// is multiplied lane-wise by |filter| and the widened products are subtracted:
// the products from |a| come out of |*c| and the products from |b| out of |*d|.
static INLINE void filter_sub_accumulate(const uint8x16_t a, const uint8x16_t b,
                                         const uint8x8_t filter, uint16x8_t *c,
                                         uint16x8_t *d) {
  const uint32x2_t a_lo = vreinterpret_u32_u8(vget_low_u8(a));
  const uint32x2_t a_hi = vreinterpret_u32_u8(vget_high_u8(a));
  const uint32x2_t b_lo = vreinterpret_u32_u8(vget_low_u8(b));
  const uint32x2_t b_hi = vreinterpret_u32_u8(vget_high_u8(b));
  // Only val[0] of each zip is needed; val[1] is ignored.
  const uint32x2x2_t a_zipped = vzip_u32(a_lo, a_hi);
  const uint32x2x2_t b_zipped = vzip_u32(b_lo, b_hi);
  const uint8x8_t a_pixels = vreinterpret_u8_u32(a_zipped.val[0]);
  const uint8x8_t b_pixels = vreinterpret_u8_u32(b_zipped.val[0]);

  *c = vmlsl_u8(*c, a_pixels, filter);
  *d = vmlsl_u8(*d, b_pixels, filter);
}
677bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Vertical-only six tap filter for a 4x4 block.
//
// src, src_stride: nine rows are read, 8 bytes each, starting two rows above
//                  |src| (src - 2 * src_stride) and ending six rows below it.
// filter_offset:   row of abs_filters[] supplying the tap magnitudes.
// dst, dst_stride: passed to store_unaligned_u8q() together with the 16
//                  filtered bytes.
static INLINE void yonly4x4(const unsigned char *src, int src_stride,
                            int filter_offset, unsigned char *dst,
                            int dst_stride) {
  // Broadcast every tap magnitude across its own vector.
  const uint8x8_t taps = vld1_u8(abs_filters[filter_offset]);
  const uint8x8_t tap0 = vdup_lane_u8(taps, 0);
  const uint8x8_t tap1 = vdup_lane_u8(taps, 1);
  const uint8x8_t tap2 = vdup_lane_u8(taps, 2);
  const uint8x8_t tap3 = vdup_lane_u8(taps, 3);
  const uint8x8_t tap4 = vdup_lane_u8(taps, 4);
  const uint8x8_t tap5 = vdup_lane_u8(taps, 5);

  // Start addresses of the nine input rows, beginning two rows above |src|.
  const unsigned char *const row0 = src - src_stride * 2;
  const unsigned char *const row1 = row0 + src_stride;
  const unsigned char *const row2 = row1 + src_stride;
  const unsigned char *const row3 = row2 + src_stride;
  const unsigned char *const row4 = row3 + src_stride;
  const unsigned char *const row5 = row4 + src_stride;
  const unsigned char *const row6 = row5 + src_stride;
  const unsigned char *const row7 = row6 + src_stride;
  const unsigned char *const row8 = row7 + src_stride;

  // The even rows (except the last) are loaded pre-shifted so that 'vext' can
  // splice each one onto the odd row that follows it. armv8 has vcopy_lane
  // which would be interesting. The shifting began as a workaround for clang
  // adding alignment hints to 32bit loads:
  // https://llvm.org/bugs/show_bug.cgi?id=24421
  // but it turned out to be almost identical to casting the loads.
  const uint8x8_t in0 = load_and_shift(row0);
  const uint8x8_t in1 = vld1_u8(row1);
  const uint8x8_t in2 = load_and_shift(row2);
  const uint8x8_t in3 = vld1_u8(row3);
  const uint8x8_t in4 = load_and_shift(row4);
  const uint8x8_t in5 = vld1_u8(row5);
  const uint8x8_t in6 = load_and_shift(row6);
  const uint8x8_t in7 = vld1_u8(row7);
  const uint8x8_t in8 = vld1_u8(row8);

  // Pair up consecutive rows so each multiply works on 8 values at once.
  const uint8x8_t rows01 = vext_u8(in0, in1, 4);
  const uint8x8_t rows23 = vext_u8(in2, in3, 4);
  const uint8x8_t rows45 = vext_u8(in4, in5, 4);
  const uint8x8_t rows67 = vext_u8(in6, in7, 4);

  // The same trick on the pairs gives the pairs that start on an odd row.
  const uint8x8_t rows12 = vext_u8(rows01, rows23, 4);
  const uint8x8_t rows34 = vext_u8(rows23, rows45, 4);
  const uint8x8_t rows56 = vext_u8(rows45, rows67, 4);
  const uint8x8_t rows78 = vext_u8(rows67, in8, 4);

  // Each output pair is accumulated in two halves: taps 0, 4 and 2 in one
  // register, taps 5, 1 and 3 in another. Every chain widens to 16 bits with
  // a multiply, subtracts the tap that is negative in the signed table (4 or
  // 1) and then adds the remaining positive tap. vmlal/vmlsl are typed as
  // unsigned but a subtraction followed by a further addition still produces
  // the right bit pattern. vqmlal/sl would be ideal but does not exist.
  const uint16x8_t taps042_out01 =
      vmlal_u8(vmlsl_u8(vmull_u8(rows01, tap0), rows45, tap4), rows23, tap2);
  const uint16x8_t taps042_out23 =
      vmlal_u8(vmlsl_u8(vmull_u8(rows23, tap0), rows67, tap4), rows45, tap2);
  const uint16x8_t taps513_out01 =
      vmlal_u8(vmlsl_u8(vmull_u8(rows56, tap5), rows12, tap1), rows34, tap3);
  const uint16x8_t taps513_out23 =
      vmlal_u8(vmlsl_u8(vmull_u8(rows78, tap5), rows34, tap1), rows56, tap3);

  // Join the halves with signed saturating adds because vmlsl may have left
  // some negative numbers in there.
  const int16x8_t sum01 = vqaddq_s16(vreinterpretq_s16_u16(taps513_out01),
                                     vreinterpretq_s16_u16(taps042_out01));
  const int16x8_t sum23 = vqaddq_s16(vreinterpretq_s16_u16(taps513_out23),
                                     vreinterpretq_s16_u16(taps042_out23));

  // Round, shift right by 7 and narrow. The input is treated as signed so
  // that numbers like -200 are saturated to 0.
  const uint8x8_t out01 = vqrshrun_n_s16(sum01, 7);
  const uint8x8_t out23 = vqrshrun_n_s16(sum23, 7);

  store_unaligned_u8q(dst, dst_stride, vcombine_u8(out01, out23));
}
157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1587bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
1597bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                int xoffset, int yoffset,
1607bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                unsigned char *dst_ptr, int dst_pitch) {
1617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x16_t s0, s1, s2, s3, s4;
1627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint64x2_t s01, s23;
1637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Variables to hold src[] elements for the given filter[]
1647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t s0_f5, s1_f5, s2_f5, s3_f5, s4_f5;
1657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t s4_f1, s4_f2, s4_f3, s4_f4;
1667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x16_t s01_f0, s23_f0;
1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint64x2_t s01_f3, s23_f3;
1687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32x2x2_t s01_f3_q, s23_f3_q, s01_f5_q, s23_f5_q;
1697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Accumulator variables.
1707bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t d0123, d4567, d89;
1717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t d0123_a, d4567_a, d89_a;
1727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t e0123, e4567, e89;
1737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Second pass intermediates.
1747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7, b8;
1757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t c0, c1, c2, c3;
1767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t d0, d1;
1777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t e0, e1;
1787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t filter, filter0, filter1, filter2, filter3, filter4, filter5;
1797bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (xoffset == 0) {  // Second pass only.
1817bc9febe8749e98a3812a0dc4380ceae75c29450Johann    yonly4x4(src_ptr, src_pixels_per_line, yoffset, dst_ptr, dst_pitch);
1827bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
1837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
1847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0) {  // First pass only.
1867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src_ptr -= 2;
1877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  } else {  // Add context for the second pass. 2 extra lines on top.
1887bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src_ptr -= 2 + (src_pixels_per_line * 2);
1897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
1907bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter = vld1_u8(abs_filters[xoffset]);
1927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter0 = vdup_lane_u8(filter, 0);
1937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter1 = vdup_lane_u8(filter, 1);
1947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter2 = vdup_lane_u8(filter, 2);
1957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter3 = vdup_lane_u8(filter, 3);
1967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter4 = vdup_lane_u8(filter, 4);
1977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter5 = vdup_lane_u8(filter, 5);
1987bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 2 bytes of context, 4 bytes of src values, 3 bytes of context, 7 bytes of
2007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // garbage. So much effort for that last single bit.
2017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // The low values of each pair are for filter0.
2027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s0 = vld1q_u8(src_ptr);
2037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2047bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s1 = vld1q_u8(src_ptr);
2057bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s2 = vld1q_u8(src_ptr);
2077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2087bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s3 = vld1q_u8(src_ptr);
2097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Shift to extract values for filter[5]
2127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // If src[] is 0, this puts:
2137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 3 4 5 6 7 8 9 10 in s0_f5
2147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Can't use vshr.u64 because it crosses the double word boundary.
2157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
2167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
2177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
2187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
2197bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
2217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
2227bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
2247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
2257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
2267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
2277bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Keep original src data as 64 bits to simplify shifting and extracting.
2297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01 = vreinterpretq_u64_u8(s01_f0);
2307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23 = vreinterpretq_u64_u8(s23_f0);
2317bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 3 4 5 6 * filter0
2337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
2347bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Shift over one to use -1, 0, 1, 2 for filter1
2367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // -1 0 1 2 * filter1
2377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
2387bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
2397bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
2407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 2 3 4 5 * filter4
2427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
2437bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
2447bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
2457bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2467bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 0 1 2 3 * filter2
2477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
2487bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
2497bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
2507bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 1 2 3 4 * filter3
2527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f3 = vshrq_n_u64(s01, 24);
2537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f3 = vshrq_n_u64(s23, 24);
2547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
2557bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
2567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
2577bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
2587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Accumulate into different registers so it can use saturated addition.
2597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
2607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
2617bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e0123 =
2637bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
2647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e4567 =
2657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
2667bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Shift and narrow.
2687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b0 = vqrshrun_n_s16(e0123, 7);
2697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b2 = vqrshrun_n_s16(e4567, 7);
2707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0) {  // firstpass_filter4x4_only
2720a39d0a697ff3603e8c100300fda363658e10b23James Zern    store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
2737bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
2747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
2757bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Load additional context when doing both filters.
2777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s0 = vld1q_u8(src_ptr);
2787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s1 = vld1q_u8(src_ptr);
2807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s2 = vld1q_u8(src_ptr);
2827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s3 = vld1q_u8(src_ptr);
2847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src_ptr += src_pixels_per_line;
2857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4 = vld1q_u8(src_ptr);
2867bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s0_f5 = vext_u8(vget_low_u8(s0), vget_high_u8(s0), 5);
2887bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s1_f5 = vext_u8(vget_low_u8(s1), vget_high_u8(s1), 5);
2897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s2_f5 = vext_u8(vget_low_u8(s2), vget_high_u8(s2), 5);
2907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s3_f5 = vext_u8(vget_low_u8(s3), vget_high_u8(s3), 5);
2917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4_f5 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 5);
2927bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // 3 4 5 6 * filter0
2947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f0 = vcombine_u8(vget_low_u8(s0), vget_low_u8(s1));
2957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f0 = vcombine_u8(vget_low_u8(s2), vget_low_u8(s3));
2967bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f5_q = vzip_u32(vreinterpret_u32_u8(s0_f5), vreinterpret_u32_u8(s1_f5));
2987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f5_q = vzip_u32(vreinterpret_u32_u8(s2_f5), vreinterpret_u32_u8(s3_f5));
2997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // But this time instead of 16 pixels to filter, there are 20. So an extra
3007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // run with a doubleword register.
3017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0123 = vmull_u8(vreinterpret_u8_u32(s01_f5_q.val[0]), filter5);
3027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4567 = vmull_u8(vreinterpret_u8_u32(s23_f5_q.val[0]), filter5);
3037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89 = vmull_u8(s4_f5, filter5);
3047bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3057bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Save a copy as u64 for shifting.
3067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01 = vreinterpretq_u64_u8(s01_f0);
3077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23 = vreinterpretq_u64_u8(s23_f0);
3087bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_add_accumulate(s01_f0, s23_f0, filter0, &d0123, &d4567);
3107bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89 = vmlal_u8(d89, vget_low_u8(s4), filter0);
3117bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 8)),
3137bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 8)), filter1,
3147bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
3157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4_f1 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 1);
3167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89 = vmlsl_u8(d89, s4_f1, filter1);
3177bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_sub_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 32)),
3197bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 32)), filter4,
3207bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
3217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4_f4 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 4);
3227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89 = vmlsl_u8(d89, s4_f4, filter4);
3237bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter_add_accumulate(vreinterpretq_u8_u64(vshrq_n_u64(s01, 16)),
3257bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        vreinterpretq_u8_u64(vshrq_n_u64(s23, 16)), filter2,
3267bc9febe8749e98a3812a0dc4380ceae75c29450Johann                        &d0123, &d4567);
3277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4_f2 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 2);
3287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89 = vmlal_u8(d89, s4_f2, filter2);
3297bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f3 = vshrq_n_u64(s01, 24);
3317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f3 = vshrq_n_u64(s23, 24);
3327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s01_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s01_f3)),
3337bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      vreinterpret_u32_u64(vget_high_u64(s01_f3)));
3347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s23_f3_q = vzip_u32(vreinterpret_u32_u64(vget_low_u64(s23_f3)),
3357bc9febe8749e98a3812a0dc4380ceae75c29450Johann                      vreinterpret_u32_u64(vget_high_u64(s23_f3)));
3367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  s4_f3 = vext_u8(vget_low_u8(s4), vget_high_u8(s4), 3);
3377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0123_a = vmull_u8(vreinterpret_u8_u32(s01_f3_q.val[0]), filter3);
3387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4567_a = vmull_u8(vreinterpret_u8_u32(s23_f3_q.val[0]), filter3);
3397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d89_a = vmull_u8(s4_f3, filter3);
3407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e0123 =
3427bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vqaddq_s16(vreinterpretq_s16_u16(d0123), vreinterpretq_s16_u16(d0123_a));
3437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e4567 =
3447bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vqaddq_s16(vreinterpretq_s16_u16(d4567), vreinterpretq_s16_u16(d4567_a));
3457bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e89 = vqaddq_s16(vreinterpretq_s16_u16(d89), vreinterpretq_s16_u16(d89_a));
3467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b4 = vqrshrun_n_s16(e0123, 7);
3487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b6 = vqrshrun_n_s16(e4567, 7);
3497bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b8 = vqrshrun_n_s16(e89, 7);
3507bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Second pass: 4x4
3527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter = vld1_u8(abs_filters[yoffset]);
3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter0 = vdup_lane_u8(filter, 0);
3547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter1 = vdup_lane_u8(filter, 1);
3557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter2 = vdup_lane_u8(filter, 2);
3567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter3 = vdup_lane_u8(filter, 3);
3577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter4 = vdup_lane_u8(filter, 4);
3587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  filter5 = vdup_lane_u8(filter, 5);
3597bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b1 = vext_u8(b0, b2, 4);
3617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b3 = vext_u8(b2, b4, 4);
3627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b5 = vext_u8(b4, b6, 4);
3637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  b7 = vext_u8(b6, b8, 4);
3647bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c0 = vmull_u8(b0, filter0);
3667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c1 = vmull_u8(b2, filter0);
3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c2 = vmull_u8(b5, filter5);
3687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c3 = vmull_u8(b7, filter5);
3697bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3707bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c0 = vmlsl_u8(c0, b4, filter4);
3717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c1 = vmlsl_u8(c1, b6, filter4);
3727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c2 = vmlsl_u8(c2, b1, filter1);
3737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c3 = vmlsl_u8(c3, b3, filter1);
3747bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c0 = vmlal_u8(c0, b2, filter2);
3767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c1 = vmlal_u8(c1, b4, filter2);
3777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c2 = vmlal_u8(c2, b3, filter3);
3787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  c3 = vmlal_u8(c3, b5, filter3);
3797bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0 = vqaddq_s16(vreinterpretq_s16_u16(c2), vreinterpretq_s16_u16(c0));
3817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1 = vqaddq_s16(vreinterpretq_s16_u16(c3), vreinterpretq_s16_u16(c1));
3827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e0 = vqrshrun_n_s16(d0, 7);
3847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  e1 = vqrshrun_n_s16(d1, 7);
3857bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3860a39d0a697ff3603e8c100300fda363658e10b23James Zern  store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
3877bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
3887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
3897bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
3907bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                int xoffset, int yoffset,
3917bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                unsigned char *dst_ptr, int dst_pitch) {
3927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  unsigned char *src;
3937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
3947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d22u8, d23u8, d24u8, d25u8, d26u8;
3957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d27u8, d28u8, d29u8, d30u8, d31u8;
3967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
3977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
3987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
3997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
4007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
4017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8;
4027bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (xoffset == 0) {  // secondpass_filter8x4_only
4047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load second_pass filter
4057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
406ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0s8 = vdup_lane_s8(dtmps8, 0);
407ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1s8 = vdup_lane_s8(dtmps8, 1);
408ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2s8 = vdup_lane_s8(dtmps8, 2);
409ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3s8 = vdup_lane_s8(dtmps8, 3);
410ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4s8 = vdup_lane_s8(dtmps8, 4);
411ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5s8 = vdup_lane_s8(dtmps8, 5);
412ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
413ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
414ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
415ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
416ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
417ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
418ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load src data
4207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - src_pixels_per_line * 2;
4217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d22u8 = vld1_u8(src);
422ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d23u8 = vld1_u8(src);
424ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d24u8 = vld1_u8(src);
426ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4277bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d25u8 = vld1_u8(src);
428ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d26u8 = vld1_u8(src);
430ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d27u8 = vld1_u8(src);
432ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vld1_u8(src);
434ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vld1_u8(src);
436ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
4377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vld1_u8(src);
438ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
439ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u16 = vmull_u8(d22u8, d0u8);
440ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u16 = vmull_u8(d23u8, d0u8);
441ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u16 = vmull_u8(d24u8, d0u8);
442ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u16 = vmull_u8(d25u8, d0u8);
443ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
444ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
445ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
446ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
447ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
448ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
449ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
450ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
451ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
452ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
453ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
454ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
455ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
456ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
457ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
458ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
459ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
460ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
461ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
462ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
463ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
464ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q7u16 = vmull_u8(d25u8, d3u8);
465ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q8u16 = vmull_u8(d26u8, d3u8);
466ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q9u16 = vmull_u8(d27u8, d3u8);
467ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q10u16 = vmull_u8(d28u8, d3u8);
468ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
469ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3s16 = vreinterpretq_s16_u16(q3u16);
470ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4s16 = vreinterpretq_s16_u16(q4u16);
471ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5s16 = vreinterpretq_s16_u16(q5u16);
472ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6s16 = vreinterpretq_s16_u16(q6u16);
473ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q7s16 = vreinterpretq_s16_u16(q7u16);
474ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q8s16 = vreinterpretq_s16_u16(q8u16);
475ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q9s16 = vreinterpretq_s16_u16(q9u16);
476ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q10s16 = vreinterpretq_s16_u16(q10u16);
477ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
478ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q7s16 = vqaddq_s16(q7s16, q3s16);
479ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q8s16 = vqaddq_s16(q8s16, q4s16);
480ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q9s16 = vqaddq_s16(q9s16, q5s16);
481ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q10s16 = vqaddq_s16(q10s16, q6s16);
482ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
483ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d6u8 = vqrshrun_n_s16(q7s16, 7);
484ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d7u8 = vqrshrun_n_s16(q8s16, 7);
485ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d8u8 = vqrshrun_n_s16(q9s16, 7);
486ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d9u8 = vqrshrun_n_s16(q10s16, 7);
487ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
488ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vst1_u8(dst_ptr, d6u8);
489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dst_ptr += dst_pitch;
490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vst1_u8(dst_ptr, d7u8);
491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dst_ptr += dst_pitch;
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vst1_u8(dst_ptr, d8u8);
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dst_ptr += dst_pitch;
494ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    vst1_u8(dst_ptr, d9u8);
495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    return;
4967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
4977bc9febe8749e98a3812a0dc4380ceae75c29450Johann
4987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // load first_pass filter
4997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
5007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
5017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
5027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
5037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
5047bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
5057bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
5077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
5087bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
5097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
5107bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
5117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
5127bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // First pass: output_height lines x output_width columns (9x8)
5147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0)  // firstpass_filter4x4_only
5157bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - 2;
5167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  else
5177bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - 2 - (src_pixels_per_line * 2);
5187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u8 = vld1q_u8(src);
5197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
5207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u8 = vld1q_u8(src);
5217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
5227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u8 = vld1q_u8(src);
5237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
5247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u8 = vld1q_u8(src);
5257bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
5277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
5287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
5297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
5307bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
5337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
5347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
5357bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
5377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
5387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
5397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
5407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
5427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
5437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
5447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
5457bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5467bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
5477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
5487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
5497bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
5507bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
5527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
5537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
5547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
5557bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
5577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
5587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
5597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
5627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
5637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
5647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
5677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
5687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
5697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
5727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
5737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
5747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
5757bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmull_u8(d28u8, d3u8);
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmull_u8(d29u8, d3u8);
5787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmull_u8(d30u8, d3u8);
5797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmull_u8(d31u8, d3u8);
5807bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3s16 = vreinterpretq_s16_u16(q3u16);
5827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4s16 = vreinterpretq_s16_u16(q4u16);
5837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5s16 = vreinterpretq_s16_u16(q5u16);
5847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6s16 = vreinterpretq_s16_u16(q6u16);
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vreinterpretq_s16_u16(q7u16);
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vreinterpretq_s16_u16(q8u16);
5877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vreinterpretq_s16_u16(q9u16);
5887bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vreinterpretq_s16_u16(q10u16);
5897bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vqaddq_s16(q7s16, q3s16);
5917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vqaddq_s16(q8s16, q4s16);
5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vqaddq_s16(q9s16, q5s16);
5937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vqaddq_s16(q10s16, q6s16);
5947bc9febe8749e98a3812a0dc4380ceae75c29450Johann
5957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d22u8 = vqrshrun_n_s16(q7s16, 7);
5967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d23u8 = vqrshrun_n_s16(q8s16, 7);
5977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d24u8 = vqrshrun_n_s16(q9s16, 7);
5987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d25u8 = vqrshrun_n_s16(q10s16, 7);
5997bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0) {  // firstpass_filter8x4_only
6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d22u8);
6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
6037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d23u8);
6047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
6057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d24u8);
6067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
6077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d25u8);
6087bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
6097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
6107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // First Pass on rest 5-line data
6127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u8 = vld1q_u8(src);
6147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
6157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u8 = vld1q_u8(src);
6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u8 = vld1q_u8(src);
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
6197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u8 = vld1q_u8(src);
6207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u8 = vld1q_u8(src);
6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
6247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
6257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
6267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
6277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
6287bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
6307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
6317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
6327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
6337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
6347bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
6367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
6377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
6387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
6397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
6407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
6427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
6437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
6447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
6457bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
6467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
6487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
6497bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
6507bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
6517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
6527bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
6547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
6557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
6567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
6577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
6587bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
6607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
6617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
6627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
6637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
6647bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
6667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
6677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
6687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
6697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
6707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
6727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
6737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
6747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
6757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
6767bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
6787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
6797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
6807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
6817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
6827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmull_u8(d27u8, d3u8);
6847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmull_u8(d28u8, d3u8);
6857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmull_u8(d29u8, d3u8);
6867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmull_u8(d30u8, d3u8);
6877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmull_u8(d31u8, d3u8);
6887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3s16 = vreinterpretq_s16_u16(q3u16);
6907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4s16 = vreinterpretq_s16_u16(q4u16);
6917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5s16 = vreinterpretq_s16_u16(q5u16);
6927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6s16 = vreinterpretq_s16_u16(q6u16);
6937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vreinterpretq_s16_u16(q7u16);
6947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vreinterpretq_s16_u16(q8u16);
6957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vreinterpretq_s16_u16(q9u16);
6967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vreinterpretq_s16_u16(q10u16);
6977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11s16 = vreinterpretq_s16_u16(q11u16);
6987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12s16 = vreinterpretq_s16_u16(q12u16);
6997bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vqaddq_s16(q8s16, q3s16);
7017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vqaddq_s16(q9s16, q4s16);
7027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vqaddq_s16(q10s16, q5s16);
7037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11s16 = vqaddq_s16(q11s16, q6s16);
7047bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12s16 = vqaddq_s16(q12s16, q7s16);
7057bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d26u8 = vqrshrun_n_s16(q8s16, 7);
7077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vqrshrun_n_s16(q9s16, 7);
7087bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vqrshrun_n_s16(q10s16, 7);
7097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vqrshrun_n_s16(q11s16, 7);
7107bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vqrshrun_n_s16(q12s16, 7);
7117bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Second pass: 8x4
7137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
7147bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
7157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
7167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
7177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
7187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
7197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
7207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
7217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
7227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
7237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
7247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
7257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
7267bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmull_u8(d22u8, d0u8);
7287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmull_u8(d23u8, d0u8);
7297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmull_u8(d24u8, d0u8);
7307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmull_u8(d25u8, d0u8);
7317bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmlsl_u8(q3u16, d23u8, d1u8);
7337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmlsl_u8(q4u16, d24u8, d1u8);
7347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmlsl_u8(q5u16, d25u8, d1u8);
7357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmlsl_u8(q6u16, d26u8, d1u8);
7367bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmlsl_u8(q3u16, d26u8, d4u8);
7387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmlsl_u8(q4u16, d27u8, d4u8);
7397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmlsl_u8(q5u16, d28u8, d4u8);
7407bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmlsl_u8(q6u16, d29u8, d4u8);
7417bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmlal_u8(q3u16, d24u8, d2u8);
7437bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmlal_u8(q4u16, d25u8, d2u8);
7447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmlal_u8(q5u16, d26u8, d2u8);
7457bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmlal_u8(q6u16, d27u8, d2u8);
7467bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmlal_u8(q3u16, d27u8, d5u8);
7487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmlal_u8(q4u16, d28u8, d5u8);
7497bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmlal_u8(q5u16, d29u8, d5u8);
7507bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmlal_u8(q6u16, d30u8, d5u8);
7517bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmull_u8(d25u8, d3u8);
7537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmull_u8(d26u8, d3u8);
7547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmull_u8(d27u8, d3u8);
7557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmull_u8(d28u8, d3u8);
7567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3s16 = vreinterpretq_s16_u16(q3u16);
7587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4s16 = vreinterpretq_s16_u16(q4u16);
7597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5s16 = vreinterpretq_s16_u16(q5u16);
7607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6s16 = vreinterpretq_s16_u16(q6u16);
7617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vreinterpretq_s16_u16(q7u16);
7627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vreinterpretq_s16_u16(q8u16);
7637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vreinterpretq_s16_u16(q9u16);
7647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vreinterpretq_s16_u16(q10u16);
7657bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vqaddq_s16(q7s16, q3s16);
7677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vqaddq_s16(q8s16, q4s16);
7687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vqaddq_s16(q9s16, q5s16);
7697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vqaddq_s16(q10s16, q6s16);
7707bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d6u8 = vqrshrun_n_s16(q7s16, 7);
7727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d7u8 = vqrshrun_n_s16(q8s16, 7);
7737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d8u8 = vqrshrun_n_s16(q9s16, 7);
7747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d9u8 = vqrshrun_n_s16(q10s16, 7);
7757bc9febe8749e98a3812a0dc4380ceae75c29450Johann
7767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  vst1_u8(dst_ptr, d6u8);
7777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dst_ptr += dst_pitch;
7787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  vst1_u8(dst_ptr, d7u8);
7797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dst_ptr += dst_pitch;
7807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  vst1_u8(dst_ptr, d8u8);
7817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dst_ptr += dst_pitch;
7827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  vst1_u8(dst_ptr, d9u8);
7837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  return;
784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
7867bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line,
7877bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                int xoffset, int yoffset,
7887bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                unsigned char *dst_ptr, int dst_pitch) {
7897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  unsigned char *src, *tmpp;
7907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  unsigned char tmp[64];
7917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int i;
7927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
7937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d18u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8, d25u8;
7947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
7957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
7967bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16;
7977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q8u16, q9u16, q10u16, q11u16, q12u16;
7987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16;
7997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16;
8007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q9u8, q10u8, q11u8, q12u8;
8017bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8027bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (xoffset == 0) {  // secondpass_filter8x8_only
8037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load second_pass filter
8047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
805ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0s8 = vdup_lane_s8(dtmps8, 0);
806ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1s8 = vdup_lane_s8(dtmps8, 1);
807ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2s8 = vdup_lane_s8(dtmps8, 2);
808ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3s8 = vdup_lane_s8(dtmps8, 3);
809ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4s8 = vdup_lane_s8(dtmps8, 4);
810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5s8 = vdup_lane_s8(dtmps8, 5);
811ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
812ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
813ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
814ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
815ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
816ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
817ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
8187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load src data
8197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - src_pixels_per_line * 2;
8207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d18u8 = vld1_u8(src);
8217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8227bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d19u8 = vld1_u8(src);
8237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d20u8 = vld1_u8(src);
8257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8267bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d21u8 = vld1_u8(src);
8277bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d22u8 = vld1_u8(src);
8297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8307bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d23u8 = vld1_u8(src);
8317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8327bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d24u8 = vld1_u8(src);
8337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d25u8 = vld1_u8(src);
8357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d26u8 = vld1_u8(src);
8377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d27u8 = vld1_u8(src);
8397bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vld1_u8(src);
8417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8427bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vld1_u8(src);
8437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
8447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vld1_u8(src);
845ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
846ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    for (i = 2; i > 0; i--) {
8477bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmull_u8(d18u8, d0u8);
8487bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmull_u8(d19u8, d0u8);
8497bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmull_u8(d20u8, d0u8);
8507bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmull_u8(d21u8, d0u8);
8517bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
8537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
8547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
8557bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
8567bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8577bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
8587bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
8597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
8607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
8617bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
8637bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
8647bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
8657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
8667bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8677bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
8687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
8697bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
8707bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
8717bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8727bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmull_u8(d21u8, d3u8);
8737bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmull_u8(d22u8, d3u8);
8747bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmull_u8(d23u8, d3u8);
8757bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10u16 = vmull_u8(d24u8, d3u8);
8767bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8777bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3s16 = vreinterpretq_s16_u16(q3u16);
8787bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4s16 = vreinterpretq_s16_u16(q4u16);
8797bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5s16 = vreinterpretq_s16_u16(q5u16);
8807bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6s16 = vreinterpretq_s16_u16(q6u16);
8817bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vreinterpretq_s16_u16(q7u16);
8827bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vreinterpretq_s16_u16(q8u16);
8837bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vreinterpretq_s16_u16(q9u16);
8847bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10s16 = vreinterpretq_s16_u16(q10u16);
8857bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vqaddq_s16(q7s16, q3s16);
8877bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vqaddq_s16(q8s16, q4s16);
8887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vqaddq_s16(q9s16, q5s16);
8897bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10s16 = vqaddq_s16(q10s16, q6s16);
8907bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8917bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d6u8 = vqrshrun_n_s16(q7s16, 7);
8927bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d7u8 = vqrshrun_n_s16(q8s16, 7);
8937bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d8u8 = vqrshrun_n_s16(q9s16, 7);
8947bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d9u8 = vqrshrun_n_s16(q10s16, 7);
8957bc9febe8749e98a3812a0dc4380ceae75c29450Johann
8967bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d18u8 = d22u8;
8977bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d19u8 = d23u8;
8987bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d20u8 = d24u8;
8997bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d21u8 = d25u8;
9007bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d22u8 = d26u8;
9017bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d23u8 = d27u8;
9027bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d24u8 = d28u8;
9037bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d25u8 = d29u8;
9047bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d26u8 = d30u8;
9057bc9febe8749e98a3812a0dc4380ceae75c29450Johann
9067bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d6u8);
9077bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
9087bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d7u8);
9097bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
9107bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d8u8);
9117bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
9127bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d9u8);
9137bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    }
9157bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
9167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
9177bc9febe8749e98a3812a0dc4380ceae75c29450Johann
9187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // load first_pass filter
9197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
9207bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
9217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
9227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
9237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
9247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
9257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
9267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
9277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
9287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
9297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
9307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
9317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
9327bc9febe8749e98a3812a0dc4380ceae75c29450Johann
9337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // First pass: 8 columns wide; 8 rows if yoffset == 0, else 13 rows (8 in the loop + 5 after it)
9347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0)  // firstpass_filter4x4_only
9357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - 2;
9367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  else
9377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - 2 - (src_pixels_per_line * 2);
9387bc9febe8749e98a3812a0dc4380ceae75c29450Johann
9397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp = tmp;
9407bc9febe8749e98a3812a0dc4380ceae75c29450Johann  for (i = 2; i > 0; i--) {
941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3u8 = vld1q_u8(src);
942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4u8 = vld1q_u8(src);
944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5u8 = vld1q_u8(src);
946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
947ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6u8 = vld1q_u8(src);
948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    src += src_pixels_per_line;
949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src);
9517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src + src_pixels_per_line);
9527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src + src_pixels_per_line * 2);
953ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9547bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
9557bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
9567bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
9577bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9597bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
9607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
9617bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
9627bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
963ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9647bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmlsl_u8(q7u16, d28u8, d1u8);
9657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlsl_u8(q8u16, d29u8, d1u8);
9667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlsl_u8(q9u16, d30u8, d1u8);
9677bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlsl_u8(q10u16, d31u8, d1u8);
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9697bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
9707bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
9717bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
9727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmlsl_u8(q7u16, d28u8, d4u8);
9757bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlsl_u8(q8u16, d29u8, d4u8);
9767bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlsl_u8(q9u16, d30u8, d4u8);
9777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlsl_u8(q10u16, d31u8, d4u8);
978ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9797bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
9807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
9817bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
9827bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmlal_u8(q7u16, d28u8, d2u8);
9857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlal_u8(q8u16, d29u8, d2u8);
9867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlal_u8(q9u16, d30u8, d2u8);
9877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlal_u8(q10u16, d31u8, d2u8);
988ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
9907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
9917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
9927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmlal_u8(q7u16, d28u8, d5u8);
9957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
9967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlal_u8(q9u16, d30u8, d5u8);
9977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlal_u8(q10u16, d31u8, d5u8);
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
9997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
10007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
10017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
10027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
10037bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmull_u8(d28u8, d3u8);
10057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmull_u8(d29u8, d3u8);
10067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmull_u8(d30u8, d3u8);
10077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmull_u8(d31u8, d3u8);
1008ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q3s16 = vreinterpretq_s16_u16(q3u16);
1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q4s16 = vreinterpretq_s16_u16(q4u16);
1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q5s16 = vreinterpretq_s16_u16(q5u16);
1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q6s16 = vreinterpretq_s16_u16(q6u16);
1013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q7s16 = vreinterpretq_s16_u16(q7u16);
1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q8s16 = vreinterpretq_s16_u16(q8u16);
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q9s16 = vreinterpretq_s16_u16(q9u16);
1016ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    q10s16 = vreinterpretq_s16_u16(q10u16);
1017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
10187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7s16 = vqaddq_s16(q7s16, q3s16);
10197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8s16 = vqaddq_s16(q8s16, q4s16);
10207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9s16 = vqaddq_s16(q9s16, q5s16);
10217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10s16 = vqaddq_s16(q10s16, q6s16);
10227bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d22u8 = vqrshrun_n_s16(q7s16, 7);
10247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d23u8 = vqrshrun_n_s16(q8s16, 7);
10257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d24u8 = vqrshrun_n_s16(q9s16, 7);
10267bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d25u8 = vqrshrun_n_s16(q10s16, 7);
10277bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    if (yoffset == 0) {  // firstpass_filter8x4_only
10297bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d22u8);
10307bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
10317bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d23u8);
10327bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
10337bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d24u8);
10347bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
10357bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst_ptr, d25u8);
10367bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst_ptr += dst_pitch;
10377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    } else {
10387bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(tmpp, d22u8);
10397bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 8;
10407bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(tmpp, d23u8);
10417bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 8;
10427bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(tmpp, d24u8);
10437bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 8;
10447bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(tmpp, d25u8);
10457bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 8;
10467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    }
10477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
10487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0) return;
10497bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10507bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // First Pass on rest 5-line data
10517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u8 = vld1q_u8(src);
10527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
10537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u8 = vld1q_u8(src);
10547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
10557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u8 = vld1q_u8(src);
10567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
10577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u8 = vld1q_u8(src);
10587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src += src_pixels_per_line;
10597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u8 = vld1q_u8(src);
10607bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmull_u8(vget_low_u8(q3u8), d0u8);
10627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmull_u8(vget_low_u8(q4u8), d0u8);
10637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmull_u8(vget_low_u8(q5u8), d0u8);
10647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmull_u8(vget_low_u8(q6u8), d0u8);
10657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmull_u8(vget_low_u8(q7u8), d0u8);
10667bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 1);
10687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 1);
10697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 1);
10707bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 1);
10717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 1);
10727bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d27u8, d1u8);
10747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
10757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
10767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlsl_u8(q11u16, d30u8, d1u8);
10777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlsl_u8(q12u16, d31u8, d1u8);
10787bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 4);
10807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 4);
10817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 4);
10827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 4);
10837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 4);
10847bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlsl_u8(q8u16, d27u8, d4u8);
10867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
10877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
10887bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlsl_u8(q11u16, d30u8, d4u8);
10897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlsl_u8(q12u16, d31u8, d4u8);
10907bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 2);
10927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 2);
10937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 2);
10947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 2);
10957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 2);
10967bc9febe8749e98a3812a0dc4380ceae75c29450Johann
10977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d27u8, d2u8);
10987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
10997bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
11007bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlal_u8(q11u16, d30u8, d2u8);
11017bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlal_u8(q12u16, d31u8, d2u8);
11027bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11037bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 5);
11047bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 5);
11057bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 5);
11067bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 5);
11077bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 5);
11087bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11097bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8u16 = vmlal_u8(q8u16, d27u8, d5u8);
11107bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
11117bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
11127bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u16 = vmlal_u8(q11u16, d30u8, d5u8);
11137bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u16 = vmlal_u8(q12u16, d31u8, d5u8);
11147bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11157bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vext_u8(vget_low_u8(q3u8), vget_high_u8(q3u8), 3);
11167bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vext_u8(vget_low_u8(q4u8), vget_high_u8(q4u8), 3);
11177bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vext_u8(vget_low_u8(q5u8), vget_high_u8(q5u8), 3);
11187bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vext_u8(vget_low_u8(q6u8), vget_high_u8(q6u8), 3);
11197bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d31u8 = vext_u8(vget_low_u8(q7u8), vget_high_u8(q7u8), 3);
11207bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11217bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3u16 = vmull_u8(d27u8, d3u8);
11227bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4u16 = vmull_u8(d28u8, d3u8);
11237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5u16 = vmull_u8(d29u8, d3u8);
11247bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6u16 = vmull_u8(d30u8, d3u8);
11257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7u16 = vmull_u8(d31u8, d3u8);
11267bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q3s16 = vreinterpretq_s16_u16(q3u16);
11287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q4s16 = vreinterpretq_s16_u16(q4u16);
11297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q5s16 = vreinterpretq_s16_u16(q5u16);
11307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q6s16 = vreinterpretq_s16_u16(q6u16);
11317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q7s16 = vreinterpretq_s16_u16(q7u16);
11327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vreinterpretq_s16_u16(q8u16);
11337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vreinterpretq_s16_u16(q9u16);
11347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vreinterpretq_s16_u16(q10u16);
11357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11s16 = vreinterpretq_s16_u16(q11u16);
11367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12s16 = vreinterpretq_s16_u16(q12u16);
11377bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q8s16 = vqaddq_s16(q8s16, q3s16);
11397bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9s16 = vqaddq_s16(q9s16, q4s16);
11407bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10s16 = vqaddq_s16(q10s16, q5s16);
11417bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11s16 = vqaddq_s16(q11s16, q6s16);
11427bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12s16 = vqaddq_s16(q12s16, q7s16);
11437bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11447bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d26u8 = vqrshrun_n_s16(q8s16, 7);
11457bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d27u8 = vqrshrun_n_s16(q9s16, 7);
11467bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d28u8 = vqrshrun_n_s16(q10s16, 7);
11477bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d29u8 = vqrshrun_n_s16(q11s16, 7);
11487bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d30u8 = vqrshrun_n_s16(q12s16, 7);
11497bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11507bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Second pass: 8x8
11517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
11527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
11537bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
11547bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
11557bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
11567bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
11577bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
11587bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
11597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
11607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
11617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
11627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
11637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
11647bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp = tmp;
11667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q9u8 = vld1q_u8(tmpp);
11677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp += 16;
11687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q10u8 = vld1q_u8(tmpp);
11697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp += 16;
11707bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q11u8 = vld1q_u8(tmpp);
11717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp += 16;
11727bc9febe8749e98a3812a0dc4380ceae75c29450Johann  q12u8 = vld1q_u8(tmpp);
11737bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11747bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d18u8 = vget_low_u8(q9u8);
11757bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d19u8 = vget_high_u8(q9u8);
11767bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d20u8 = vget_low_u8(q10u8);
11777bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d21u8 = vget_high_u8(q10u8);
11787bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d22u8 = vget_low_u8(q11u8);
11797bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d23u8 = vget_high_u8(q11u8);
11807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d24u8 = vget_low_u8(q12u8);
11817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d25u8 = vget_high_u8(q12u8);
11827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  for (i = 2; i > 0; i--) {
11847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmull_u8(d18u8, d0u8);
11857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmull_u8(d19u8, d0u8);
11867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmull_u8(d20u8, d0u8);
11877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmull_u8(d21u8, d0u8);
11887bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
11907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
11917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
11927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
11937bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
11957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
11967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
11977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
11987bc9febe8749e98a3812a0dc4380ceae75c29450Johann
11997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
12007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
12017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
12027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
12037bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
12057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
12067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
12077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
12087bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmull_u8(d21u8, d3u8);
12107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmull_u8(d22u8, d3u8);
12117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmull_u8(d23u8, d3u8);
12127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmull_u8(d24u8, d3u8);
1213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3s16 = vreinterpretq_s16_u16(q3u16);
12157bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4s16 = vreinterpretq_s16_u16(q4u16);
12167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5s16 = vreinterpretq_s16_u16(q5u16);
12177bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6s16 = vreinterpretq_s16_u16(q6u16);
12187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7s16 = vreinterpretq_s16_u16(q7u16);
12197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8s16 = vreinterpretq_s16_u16(q8u16);
12207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9s16 = vreinterpretq_s16_u16(q9u16);
12217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10s16 = vreinterpretq_s16_u16(q10u16);
1222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7s16 = vqaddq_s16(q7s16, q3s16);
12247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8s16 = vqaddq_s16(q8s16, q4s16);
12257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9s16 = vqaddq_s16(q9s16, q5s16);
12267bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10s16 = vqaddq_s16(q10s16, q6s16);
12277bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d6u8 = vqrshrun_n_s16(q7s16, 7);
12297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d7u8 = vqrshrun_n_s16(q8s16, 7);
12307bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d8u8 = vqrshrun_n_s16(q9s16, 7);
12317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d9u8 = vqrshrun_n_s16(q10s16, 7);
12327bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d18u8 = d22u8;
12347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d19u8 = d23u8;
12357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d20u8 = d24u8;
12367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d21u8 = d25u8;
12377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d22u8 = d26u8;
12387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d23u8 = d27u8;
12397bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d24u8 = d28u8;
12407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d25u8 = d29u8;
12417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d26u8 = d30u8;
12427bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d6u8);
12447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
12457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d7u8);
12467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
12477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d8u8);
12487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
12497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(dst_ptr, d9u8);
12507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst_ptr += dst_pitch;
12517bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
12527bc9febe8749e98a3812a0dc4380ceae75c29450Johann  return;
12537bc9febe8749e98a3812a0dc4380ceae75c29450Johann}
12547bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12557bc9febe8749e98a3812a0dc4380ceae75c29450Johannvoid vp8_sixtap_predict16x16_neon(unsigned char *src_ptr,
12567bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  int src_pixels_per_line, int xoffset,
12577bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  int yoffset, unsigned char *dst_ptr,
12587bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  int dst_pitch) {
12597bc9febe8749e98a3812a0dc4380ceae75c29450Johann  unsigned char *src, *src_tmp, *dst, *tmpp;
12607bc9febe8749e98a3812a0dc4380ceae75c29450Johann  unsigned char tmp[336];
12617bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int i, j;
12627bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
12637bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d18u8, d19u8;
12647bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d20u8, d21u8, d22u8, d23u8, d24u8, d25u8, d26u8, d27u8;
12657bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x8_t d28u8, d29u8, d30u8, d31u8;
12667bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int8x8_t dtmps8, d0s8, d1s8, d2s8, d3s8, d4s8, d5s8;
12677bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8x16_t q3u8, q4u8;
12687bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q3u16, q4u16, q5u16, q6u16, q7u16, q8u16, q9u16, q10u16;
12697bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint16x8_t q11u16, q12u16, q13u16, q15u16;
12707bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q3s16, q4s16, q5s16, q6s16, q7s16, q8s16, q9s16, q10s16;
12717bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int16x8_t q11s16, q12s16, q13s16, q15s16;
12727bc9febe8749e98a3812a0dc4380ceae75c29450Johann
12737bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (xoffset == 0) {  // secondpass_filter8x8_only
12747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load second_pass filter
1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0s8 = vdup_lane_s8(dtmps8, 0);
1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1s8 = vdup_lane_s8(dtmps8, 1);
1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2s8 = vdup_lane_s8(dtmps8, 2);
1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3s8 = vdup_lane_s8(dtmps8, 3);
1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4s8 = vdup_lane_s8(dtmps8, 4);
1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5s8 = vdup_lane_s8(dtmps8, 5);
1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
1284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    // load src data
12907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src_tmp = src_ptr - src_pixels_per_line * 2;
12917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    for (i = 0; i < 2; ++i) {
12927bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src = src_tmp + i * 8;
12937bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst = dst_ptr + i * 8;
12947bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d18u8 = vld1_u8(src);
12957bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
12967bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d19u8 = vld1_u8(src);
12977bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
12987bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d20u8 = vld1_u8(src);
12997bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
13007bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d21u8 = vld1_u8(src);
13017bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
13027bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d22u8 = vld1_u8(src);
13037bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
13047bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (j = 0; j < 4; ++j) {
13057bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d23u8 = vld1_u8(src);
13067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_pixels_per_line;
13077bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d24u8 = vld1_u8(src);
13087bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_pixels_per_line;
13097bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d25u8 = vld1_u8(src);
13107bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_pixels_per_line;
13117bc9febe8749e98a3812a0dc4380ceae75c29450Johann        d26u8 = vld1_u8(src);
13127bc9febe8749e98a3812a0dc4380ceae75c29450Johann        src += src_pixels_per_line;
1313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3u16 = vmull_u8(d18u8, d0u8);
1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4u16 = vmull_u8(d19u8, d0u8);
1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5u16 = vmull_u8(d20u8, d0u8);
1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6u16 = vmull_u8(d21u8, d0u8);
1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
1320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
1321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
1326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
1327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
1332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
1333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q7u16 = vmull_u8(d21u8, d3u8);
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q8u16 = vmull_u8(d22u8, d3u8);
1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q9u16 = vmull_u8(d23u8, d3u8);
1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q10u16 = vmull_u8(d24u8, d3u8);
1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q3s16 = vreinterpretq_s16_u16(q3u16);
1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q4s16 = vreinterpretq_s16_u16(q4u16);
1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q5s16 = vreinterpretq_s16_u16(q5u16);
1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q6s16 = vreinterpretq_s16_u16(q6u16);
1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q7s16 = vreinterpretq_s16_u16(q7u16);
1349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q8s16 = vreinterpretq_s16_u16(q8u16);
1350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q9s16 = vreinterpretq_s16_u16(q9u16);
1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q10s16 = vreinterpretq_s16_u16(q10u16);
1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q7s16 = vqaddq_s16(q7s16, q3s16);
1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q8s16 = vqaddq_s16(q8s16, q4s16);
1355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q9s16 = vqaddq_s16(q9s16, q5s16);
1356ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        q10s16 = vqaddq_s16(q10s16, q6s16);
1357ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1358ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d6u8 = vqrshrun_n_s16(q7s16, 7);
1359ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d7u8 = vqrshrun_n_s16(q8s16, 7);
1360ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d8u8 = vqrshrun_n_s16(q9s16, 7);
1361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d9u8 = vqrshrun_n_s16(q10s16, 7);
1362ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1363ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d18u8 = d22u8;
1364ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d19u8 = d23u8;
1365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d20u8 = d24u8;
1366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d21u8 = d25u8;
1367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        d22u8 = d26u8;
1368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
13697bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u8(dst, d6u8);
13707bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_pitch;
13717bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u8(dst, d7u8);
13727bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_pitch;
13737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u8(dst, d8u8);
13747bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_pitch;
13757bc9febe8749e98a3812a0dc4380ceae75c29450Johann        vst1_u8(dst, d9u8);
13767bc9febe8749e98a3812a0dc4380ceae75c29450Johann        dst += dst_pitch;
13777bc9febe8749e98a3812a0dc4380ceae75c29450Johann      }
1378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    }
13797bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
13807bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
13817bc9febe8749e98a3812a0dc4380ceae75c29450Johann
13827bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // load first_pass filter
13837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[xoffset]);
13847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
13857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
13867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
13877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
13887bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
13897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
13907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
13917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
13927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
13937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
13947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
13957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
13967bc9febe8749e98a3812a0dc4380ceae75c29450Johann
13977bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // First pass: output_height lines x output_width columns (21x16)
13987bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (yoffset == 0) {  // firstpass_filter4x4_only
13997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src = src_ptr - 2;
14007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst = dst_ptr;
14017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    for (i = 0; i < 8; ++i) {
14027bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d6u8 = vld1_u8(src);
14037bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d7u8 = vld1_u8(src + 8);
14047bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d8u8 = vld1_u8(src + 16);
14057bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
14067bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d9u8 = vld1_u8(src);
14077bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d10u8 = vld1_u8(src + 8);
14087bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d11u8 = vld1_u8(src + 16);
14097bc9febe8749e98a3812a0dc4380ceae75c29450Johann      src += src_pixels_per_line;
14107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14117bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src);
14127bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __builtin_prefetch(src + src_pixels_per_line);
14137bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14147bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmull_u8(d6u8, d0u8);
14157bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmull_u8(d7u8, d0u8);
14167bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmull_u8(d9u8, d0u8);
14177bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmull_u8(d10u8, d0u8);
14187bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14197bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d20u8 = vext_u8(d6u8, d7u8, 1);
14207bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d21u8 = vext_u8(d9u8, d10u8, 1);
14217bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d22u8 = vext_u8(d7u8, d8u8, 1);
14227bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d23u8 = vext_u8(d10u8, d11u8, 1);
14237bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d24u8 = vext_u8(d6u8, d7u8, 4);
14247bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d25u8 = vext_u8(d9u8, d10u8, 4);
14257bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d26u8 = vext_u8(d7u8, d8u8, 4);
14267bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d27u8 = vext_u8(d10u8, d11u8, 4);
14277bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d28u8 = vext_u8(d6u8, d7u8, 5);
14287bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d29u8 = vext_u8(d9u8, d10u8, 5);
14297bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14307bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d20u8, d1u8);
14317bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmlsl_u8(q8u16, d21u8, d1u8);
14327bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmlsl_u8(q7u16, d22u8, d1u8);
14337bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmlsl_u8(q9u16, d23u8, d1u8);
14347bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d24u8, d4u8);
14357bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmlsl_u8(q8u16, d25u8, d4u8);
14367bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmlsl_u8(q7u16, d26u8, d4u8);
14377bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmlsl_u8(q9u16, d27u8, d4u8);
14387bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d28u8, d5u8);
14397bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmlal_u8(q8u16, d29u8, d5u8);
14407bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14417bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d20u8 = vext_u8(d7u8, d8u8, 5);
14427bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d21u8 = vext_u8(d10u8, d11u8, 5);
14437bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d22u8 = vext_u8(d6u8, d7u8, 2);
14447bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d23u8 = vext_u8(d9u8, d10u8, 2);
14457bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d24u8 = vext_u8(d7u8, d8u8, 2);
14467bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d25u8 = vext_u8(d10u8, d11u8, 2);
14477bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d26u8 = vext_u8(d6u8, d7u8, 3);
14487bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d27u8 = vext_u8(d9u8, d10u8, 3);
14497bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d28u8 = vext_u8(d7u8, d8u8, 3);
14507bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d29u8 = vext_u8(d10u8, d11u8, 3);
14517bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14527bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmlal_u8(q7u16, d20u8, d5u8);
14537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmlal_u8(q9u16, d21u8, d5u8);
14547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d22u8, d2u8);
14557bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmlal_u8(q8u16, d23u8, d2u8);
14567bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmlal_u8(q7u16, d24u8, d2u8);
14577bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmlal_u8(q9u16, d25u8, d2u8);
14587bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10u16 = vmull_u8(d26u8, d3u8);
14607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q11u16 = vmull_u8(d27u8, d3u8);
14617bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q12u16 = vmull_u8(d28u8, d3u8);
14627bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q15u16 = vmull_u8(d29u8, d3u8);
14637bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14647bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6s16 = vreinterpretq_s16_u16(q6u16);
14657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vreinterpretq_s16_u16(q7u16);
14667bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vreinterpretq_s16_u16(q8u16);
14677bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vreinterpretq_s16_u16(q9u16);
14687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10s16 = vreinterpretq_s16_u16(q10u16);
14697bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q11s16 = vreinterpretq_s16_u16(q11u16);
14707bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q12s16 = vreinterpretq_s16_u16(q12u16);
14717bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q15s16 = vreinterpretq_s16_u16(q15u16);
14727bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14737bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6s16 = vqaddq_s16(q6s16, q10s16);
14747bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vqaddq_s16(q8s16, q11s16);
14757bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vqaddq_s16(q7s16, q12s16);
14767bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vqaddq_s16(q9s16, q15s16);
14777bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14787bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d6u8 = vqrshrun_n_s16(q6s16, 7);
14797bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d7u8 = vqrshrun_n_s16(q7s16, 7);
14807bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d8u8 = vqrshrun_n_s16(q8s16, 7);
14817bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d9u8 = vqrshrun_n_s16(q9s16, 7);
14827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14837bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u8 = vcombine_u8(d6u8, d7u8);
14847bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u8 = vcombine_u8(d8u8, d9u8);
14857bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1q_u8(dst, q3u8);
14867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
14877bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1q_u8(dst, q4u8);
14887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
1489ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    }
14907bc9febe8749e98a3812a0dc4380ceae75c29450Johann    return;
14917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
14927bc9febe8749e98a3812a0dc4380ceae75c29450Johann
14937bc9febe8749e98a3812a0dc4380ceae75c29450Johann  src = src_ptr - 2 - src_pixels_per_line * 2;
14947bc9febe8749e98a3812a0dc4380ceae75c29450Johann  tmpp = tmp;
14957bc9febe8749e98a3812a0dc4380ceae75c29450Johann  for (i = 0; i < 7; ++i) {
14967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d6u8 = vld1_u8(src);
14977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d7u8 = vld1_u8(src + 8);
14987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d8u8 = vld1_u8(src + 16);
14997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
15007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d9u8 = vld1_u8(src);
15017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d10u8 = vld1_u8(src + 8);
15027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d11u8 = vld1_u8(src + 16);
15037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
15047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d12u8 = vld1_u8(src);
15057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d13u8 = vld1_u8(src + 8);
15067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d14u8 = vld1_u8(src + 16);
15077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    src += src_pixels_per_line;
1508ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src);
15107bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src + src_pixels_per_line);
15117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __builtin_prefetch(src + src_pixels_per_line * 2);
15127bc9febe8749e98a3812a0dc4380ceae75c29450Johann
15137bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmull_u8(d6u8, d0u8);
15147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmull_u8(d7u8, d0u8);
15157bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmull_u8(d9u8, d0u8);
15167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11u16 = vmull_u8(d10u8, d0u8);
15177bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12u16 = vmull_u8(d12u8, d0u8);
15187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13u16 = vmull_u8(d13u8, d0u8);
15197bc9febe8749e98a3812a0dc4380ceae75c29450Johann
15207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d6u8, d7u8, 1);
15217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d9u8, d10u8, 1);
15227bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d12u8, d13u8, 1);
15237bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlsl_u8(q8u16, d28u8, d1u8);
15247bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlsl_u8(q10u16, d29u8, d1u8);
15257bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12u16 = vmlsl_u8(q12u16, d30u8, d1u8);
15267bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d7u8, d8u8, 1);
15277bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d10u8, d11u8, 1);
15287bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d13u8, d14u8, 1);
15297bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlsl_u8(q9u16, d28u8, d1u8);
15307bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11u16 = vmlsl_u8(q11u16, d29u8, d1u8);
15317bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13u16 = vmlsl_u8(q13u16, d30u8, d1u8);
1532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15337bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d6u8, d7u8, 4);
15347bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d9u8, d10u8, 4);
15357bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d12u8, d13u8, 4);
15367bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlsl_u8(q8u16, d28u8, d4u8);
15377bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlsl_u8(q10u16, d29u8, d4u8);
15387bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12u16 = vmlsl_u8(q12u16, d30u8, d4u8);
15397bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d7u8, d8u8, 4);
15407bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d10u8, d11u8, 4);
15417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d13u8, d14u8, 4);
15427bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlsl_u8(q9u16, d28u8, d4u8);
15437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11u16 = vmlsl_u8(q11u16, d29u8, d4u8);
15447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13u16 = vmlsl_u8(q13u16, d30u8, d4u8);
1545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d6u8, d7u8, 5);
15477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d9u8, d10u8, 5);
15487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d12u8, d13u8, 5);
15497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlal_u8(q8u16, d28u8, d5u8);
15507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlal_u8(q10u16, d29u8, d5u8);
15517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12u16 = vmlal_u8(q12u16, d30u8, d5u8);
15527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d7u8, d8u8, 5);
15537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d10u8, d11u8, 5);
15547bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d13u8, d14u8, 5);
15557bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlal_u8(q9u16, d28u8, d5u8);
15567bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11u16 = vmlal_u8(q11u16, d29u8, d5u8);
15577bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13u16 = vmlal_u8(q13u16, d30u8, d5u8);
1558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15597bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d6u8, d7u8, 2);
15607bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d9u8, d10u8, 2);
15617bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d12u8, d13u8, 2);
15627bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8u16 = vmlal_u8(q8u16, d28u8, d2u8);
15637bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10u16 = vmlal_u8(q10u16, d29u8, d2u8);
15647bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12u16 = vmlal_u8(q12u16, d30u8, d2u8);
15657bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d7u8, d8u8, 2);
15667bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d10u8, d11u8, 2);
15677bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d13u8, d14u8, 2);
15687bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9u16 = vmlal_u8(q9u16, d28u8, d2u8);
15697bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11u16 = vmlal_u8(q11u16, d29u8, d2u8);
15707bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13u16 = vmlal_u8(q13u16, d30u8, d2u8);
15717bc9febe8749e98a3812a0dc4380ceae75c29450Johann
15727bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d28u8 = vext_u8(d6u8, d7u8, 3);
15737bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d29u8 = vext_u8(d9u8, d10u8, 3);
15747bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d30u8 = vext_u8(d12u8, d13u8, 3);
15757bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d15u8 = vext_u8(d7u8, d8u8, 3);
15767bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d31u8 = vext_u8(d10u8, d11u8, 3);
15777bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d6u8 = vext_u8(d13u8, d14u8, 3);
15787bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4u16 = vmull_u8(d28u8, d3u8);
15797bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5u16 = vmull_u8(d29u8, d3u8);
15807bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmull_u8(d30u8, d3u8);
15817bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q4s16 = vreinterpretq_s16_u16(q4u16);
15827bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q5s16 = vreinterpretq_s16_u16(q5u16);
15837bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6s16 = vreinterpretq_s16_u16(q6u16);
15847bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8s16 = vreinterpretq_s16_u16(q8u16);
15857bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10s16 = vreinterpretq_s16_u16(q10u16);
15867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12s16 = vreinterpretq_s16_u16(q12u16);
15877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q8s16 = vqaddq_s16(q8s16, q4s16);
15887bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q10s16 = vqaddq_s16(q10s16, q5s16);
15897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q12s16 = vqaddq_s16(q12s16, q6s16);
1590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6u16 = vmull_u8(d15u8, d3u8);
15927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7u16 = vmull_u8(d31u8, d3u8);
15937bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3u16 = vmull_u8(d6u8, d3u8);
15947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q3s16 = vreinterpretq_s16_u16(q3u16);
15957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q6s16 = vreinterpretq_s16_u16(q6u16);
15967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q7s16 = vreinterpretq_s16_u16(q7u16);
15977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9s16 = vreinterpretq_s16_u16(q9u16);
15987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11s16 = vreinterpretq_s16_u16(q11u16);
15997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13s16 = vreinterpretq_s16_u16(q13u16);
16007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q9s16 = vqaddq_s16(q9s16, q6s16);
16017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q11s16 = vqaddq_s16(q11s16, q7s16);
16027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    q13s16 = vqaddq_s16(q13s16, q3s16);
16037bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d6u8 = vqrshrun_n_s16(q8s16, 7);
16057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d7u8 = vqrshrun_n_s16(q9s16, 7);
16067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d8u8 = vqrshrun_n_s16(q10s16, 7);
16077bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d9u8 = vqrshrun_n_s16(q11s16, 7);
16087bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d10u8 = vqrshrun_n_s16(q12s16, 7);
16097bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d11u8 = vqrshrun_n_s16(q13s16, 7);
16107bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16117bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d6u8);
16127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16137bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d7u8);
16147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16157bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d8u8);
16167bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16177bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d9u8);
16187bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16197bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d10u8);
16207bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16217bc9febe8749e98a3812a0dc4380ceae75c29450Johann    vst1_u8(tmpp, d11u8);
16227bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 8;
16237bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
16247bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  // Second pass: 16x16
16267bc9febe8749e98a3812a0dc4380ceae75c29450Johann  dtmps8 = vld1_s8(vp8_sub_pel_filters[yoffset]);
16277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0s8 = vdup_lane_s8(dtmps8, 0);
16287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1s8 = vdup_lane_s8(dtmps8, 1);
16297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2s8 = vdup_lane_s8(dtmps8, 2);
16307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3s8 = vdup_lane_s8(dtmps8, 3);
16317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4s8 = vdup_lane_s8(dtmps8, 4);
16327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5s8 = vdup_lane_s8(dtmps8, 5);
16337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d0u8 = vreinterpret_u8_s8(vabs_s8(d0s8));
16347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d1u8 = vreinterpret_u8_s8(vabs_s8(d1s8));
16357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d2u8 = vreinterpret_u8_s8(vabs_s8(d2s8));
16367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d3u8 = vreinterpret_u8_s8(vabs_s8(d3s8));
16377bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d4u8 = vreinterpret_u8_s8(vabs_s8(d4s8));
16387bc9febe8749e98a3812a0dc4380ceae75c29450Johann  d5u8 = vreinterpret_u8_s8(vabs_s8(d5s8));
16397bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16407bc9febe8749e98a3812a0dc4380ceae75c29450Johann  for (i = 0; i < 2; ++i) {
16417bc9febe8749e98a3812a0dc4380ceae75c29450Johann    dst = dst_ptr + 8 * i;
16427bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp = tmp + 8 * i;
16437bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d18u8 = vld1_u8(tmpp);
16447bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 16;
16457bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d19u8 = vld1_u8(tmpp);
16467bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 16;
16477bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d20u8 = vld1_u8(tmpp);
16487bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 16;
16497bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d21u8 = vld1_u8(tmpp);
16507bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 16;
16517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    d22u8 = vld1_u8(tmpp);
16527bc9febe8749e98a3812a0dc4380ceae75c29450Johann    tmpp += 16;
16537bc9febe8749e98a3812a0dc4380ceae75c29450Johann    for (j = 0; j < 4; ++j) {
16547bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d23u8 = vld1_u8(tmpp);
16557bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 16;
16567bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d24u8 = vld1_u8(tmpp);
16577bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 16;
16587bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d25u8 = vld1_u8(tmpp);
16597bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 16;
16607bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d26u8 = vld1_u8(tmpp);
16617bc9febe8749e98a3812a0dc4380ceae75c29450Johann      tmpp += 16;
16627bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16637bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmull_u8(d18u8, d0u8);
16647bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmull_u8(d19u8, d0u8);
16657bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmull_u8(d20u8, d0u8);
16667bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmull_u8(d21u8, d0u8);
16677bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlsl_u8(q3u16, d19u8, d1u8);
16697bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlsl_u8(q4u16, d20u8, d1u8);
16707bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlsl_u8(q5u16, d21u8, d1u8);
16717bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d22u8, d1u8);
16727bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16737bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlsl_u8(q3u16, d22u8, d4u8);
16747bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlsl_u8(q4u16, d23u8, d4u8);
16757bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlsl_u8(q5u16, d24u8, d4u8);
16767bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlsl_u8(q6u16, d25u8, d4u8);
16777bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16787bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlal_u8(q3u16, d20u8, d2u8);
16797bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlal_u8(q4u16, d21u8, d2u8);
16807bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlal_u8(q5u16, d22u8, d2u8);
16817bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d23u8, d2u8);
16827bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16837bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3u16 = vmlal_u8(q3u16, d23u8, d5u8);
16847bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4u16 = vmlal_u8(q4u16, d24u8, d5u8);
16857bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5u16 = vmlal_u8(q5u16, d25u8, d5u8);
16867bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6u16 = vmlal_u8(q6u16, d26u8, d5u8);
16877bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16887bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7u16 = vmull_u8(d21u8, d3u8);
16897bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8u16 = vmull_u8(d22u8, d3u8);
16907bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9u16 = vmull_u8(d23u8, d3u8);
16917bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10u16 = vmull_u8(d24u8, d3u8);
16927bc9febe8749e98a3812a0dc4380ceae75c29450Johann
16937bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q3s16 = vreinterpretq_s16_u16(q3u16);
16947bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q4s16 = vreinterpretq_s16_u16(q4u16);
16957bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q5s16 = vreinterpretq_s16_u16(q5u16);
16967bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q6s16 = vreinterpretq_s16_u16(q6u16);
16977bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vreinterpretq_s16_u16(q7u16);
16987bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vreinterpretq_s16_u16(q8u16);
16997bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vreinterpretq_s16_u16(q9u16);
17007bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10s16 = vreinterpretq_s16_u16(q10u16);
17017bc9febe8749e98a3812a0dc4380ceae75c29450Johann
17027bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q7s16 = vqaddq_s16(q7s16, q3s16);
17037bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q8s16 = vqaddq_s16(q8s16, q4s16);
17047bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q9s16 = vqaddq_s16(q9s16, q5s16);
17057bc9febe8749e98a3812a0dc4380ceae75c29450Johann      q10s16 = vqaddq_s16(q10s16, q6s16);
17067bc9febe8749e98a3812a0dc4380ceae75c29450Johann
17077bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d6u8 = vqrshrun_n_s16(q7s16, 7);
17087bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d7u8 = vqrshrun_n_s16(q8s16, 7);
17097bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d8u8 = vqrshrun_n_s16(q9s16, 7);
17107bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d9u8 = vqrshrun_n_s16(q10s16, 7);
17117bc9febe8749e98a3812a0dc4380ceae75c29450Johann
17127bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d18u8 = d22u8;
17137bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d19u8 = d23u8;
17147bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d20u8 = d24u8;
17157bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d21u8 = d25u8;
17167bc9febe8749e98a3812a0dc4380ceae75c29450Johann      d22u8 = d26u8;
17177bc9febe8749e98a3812a0dc4380ceae75c29450Johann
17187bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst, d6u8);
17197bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
17207bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst, d7u8);
17217bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
17227bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst, d8u8);
17237bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
17247bc9febe8749e98a3812a0dc4380ceae75c29450Johann      vst1_u8(dst, d9u8);
17257bc9febe8749e98a3812a0dc4380ceae75c29450Johann      dst += dst_pitch;
1726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    }
17277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  }
17287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  return;
1729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1730