/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

/*
 * Applies an 8-tap filter to four values at once.
 *
 * dsrc0..dsrc7: for each of the four lanes, the input sample that is
 *               multiplied by tap 0..7 respectively.
 * q0s16:        the eight 16-bit tap coefficients; lane k is applied to
 *               dsrcK.
 *
 * Returns the four 32-bit sums of products.  No rounding, shifting or
 * saturation is performed here; callers narrow the result themselves.
 */
static INLINE int32x4_t MULTIPLY_BY_Q0(
    int16x4_t dsrc0,
    int16x4_t dsrc1,
    int16x4_t dsrc2,
    int16x4_t dsrc3,
    int16x4_t dsrc4,
    int16x4_t dsrc5,
    int16x4_t dsrc6,
    int16x4_t dsrc7,
    int16x8_t q0s16) {
  int32x4_t qdst;
  int16x4_t d0s16, d1s16;

  // The by-lane multiplies take a 64-bit coefficient vector, so split the
  // eight taps into taps 0-3 and taps 4-7.
  d0s16 = vget_low_s16(q0s16);
  d1s16 = vget_high_s16(q0s16);

  // Widening multiply for tap 0, then widening multiply-accumulate for the
  // remaining seven taps.
  qdst = vmull_lane_s16(dsrc0, d0s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
  qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
  qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
  qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
  qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
  return qdst;
}

/*
 * Horizontal 8-tap convolution without scaling, NEON implementation.
 *
 * src, src_stride: source pixels and row pitch.  The filter reads 3 pixels
 *                  to the left of src and further pixels to the right.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x:        eight 16-bit filter taps (all eight are loaded).
 * x_step_q4:       must be 16; checked by assert only.
 * filter_y, y_step_q4: unused.
 * w, h:            block size.  The block is processed in 4x4 tiles, and
 *                  every tile reads and writes a full 4 columns by 4 rows,
 *                  so the loops over-run unless w and h are multiples of 4.
 *
 * Each output pixel is the 8-tap sum rounded and shifted right by 7, then
 * saturated to 8 bits.
 *
 * NOTE(review): pixel rows are loaded/stored through uint32_t pointer casts
 * of byte pointers; this presumably relies on the target tolerating
 * unaligned 32-bit NEON accesses -- confirm.
 */
void vpx_convolve8_horiz_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,
    int x_step_q4,
    const int16_t *filter_y,  // unused
    int y_step_q4,            // unused
    int w,
    int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  // filter_y / y_step_q4 are never used, and x_step_q4 is only read by the
  // assert below (which disappears under NDEBUG); mark them used so the
  // function builds without unused-parameter warnings.
  (void)filter_y;
  (void)y_step_q4;
  (void)x_step_q4;

  assert(x_step_q4 == 16);

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4,
    src += src_stride * 4,
    dst += dst_stride * 4) {  // loop_horiz_v
    // Load the first 8 pixels (columns 0..7 relative to the adjusted src)
    // of four consecutive rows.
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    // Transpose the 4x8 byte block so that each group of four lanes holds
    // one source column across the four rows:
    //   d0x2u8.val[0] = col0 | col4,  d0x2u8.val[1] = col1 | col5
    //   d1x2u8.val[0] = col2 | col6,  d1x2u8.val[1] = col3 | col7
    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
                        vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    // Prefetch hints for the next group of rows; they do not affect results.
    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    // Widen the columns to 16 bits for the multiply-accumulates.
    q8u16  = vmovl_u8(d0x2u8.val[0]);
    q9u16  = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    // Regroup so that q8 = col0 | col1 and q9 = col4 | col5.
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    // d20 = col2, d23 = col6; col3 stays in the low half of q11.
    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst;
         width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      // Load the next four pixels (columns 7..10 of the current window) of
      // each of the four rows.  Rows 2 and 3 go to d31/d30 respectively.
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      // Transpose the new 4x4 block into columns.
      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
                         vreinterpret_u16_u32(d31u32));
      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
                         vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
                          vreinterpretq_u32_u8(q15u8));

      // After widening: q12 = col7 | col10, q13 = col8 | col9.
      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      // Column views for this iteration:
      //   d16=col0 d17=col1 d20=col2 d22=col3 d18=col4 d19=col5 d23=col6
      //   d24=col7 d26=col8 d27=col9 d25=col10
      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // Four output columns; each call slides the 8-column window by one.
      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
                              d18s16, d19s16, d23s16, d24s16, q0s16);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
                              d19s16, d23s16, d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
                              d23s16, d24s16, d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      // Round, shift right by 7 and saturate to unsigned 16 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // Saturate to 8 bits.
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      // Transpose the column-ordered results back into rows:
      //   d2u32 = row0 | row2,  d3u32 = row1 | row3.
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
                         vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      // Store four pixels to each of the four destination rows.
      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      // Slide the window four columns to the right: old columns 4..10
      // become columns 0..6 of the next iteration.
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
}

/*
 * Vertical 8-tap convolution without scaling, NEON implementation.
 *
 * src, src_stride: source pixels and row pitch.  The filter reads 3 rows
 *                  above src and further rows below.
 * dst, dst_stride: destination pixels and row pitch.
 * filter_x, x_step_q4: unused.
 * filter_y:        eight 16-bit filter taps (all eight are loaded).
 * y_step_q4:       must be 16; checked by assert only.
 * w, h:            block size.  The block is processed in strips 4 pixels
 *                  wide, 4 rows per inner iteration, and every step reads
 *                  and writes a full 4x4 tile, so the loops over-run unless
 *                  w and h are multiples of 4.
 *
 * Each output pixel is the 8-tap sum rounded and shifted right by 7, then
 * saturated to 8 bits.
 *
 * NOTE(review): pixel rows are loaded/stored through uint32_t pointer casts
 * of byte pointers; this presumably relies on the target tolerating
 * unaligned 32-bit NEON accesses -- confirm.
 */
void vpx_convolve8_vert_neon(
    const uint8_t *src,
    ptrdiff_t src_stride,
    uint8_t *dst,
    ptrdiff_t dst_stride,
    const int16_t *filter_x,  // unused
    int x_step_q4,            // unused
    const int16_t *filter_y,
    int y_step_q4,
    int w,
    int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint32x2_t d2u32, d3u32;
  // These vectors are filled one lane at a time with vld1_lane_u32, which
  // takes the previous vector value as an input.  Start them from zero so no
  // indeterminate value is ever read (d22u32 lane 1 is never loaded at all).
  uint32x2_t d16u32 = vdup_n_u32(0);
  uint32x2_t d18u32 = vdup_n_u32(0);
  uint32x2_t d20u32 = vdup_n_u32(0);
  uint32x2_t d22u32 = vdup_n_u32(0);
  uint32x2_t d24u32 = vdup_n_u32(0);
  uint32x2_t d26u32 = vdup_n_u32(0);
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  // filter_x / x_step_q4 are never used, and y_step_q4 is only read by the
  // assert below (which disappears under NDEBUG); mark them used so the
  // function builds without unused-parameter warnings.
  (void)filter_x;
  (void)x_step_q4;
  (void)y_step_q4;

  assert(y_step_q4 == 16);

  src -= src_stride * 3;  // adjust for taps
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    // Prime the window with the first seven rows (4 pixels each) of this
    // strip: d16 = row0 | row1, d18 = row2 | row3, d20 = row4 | row5,
    // d22 lane 0 = row6.
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    // Widen to 16 bits for the multiply-accumulates.
    q8u16  = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16  = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    // d18 = row2, d19 = row3, d22 = row6 (only the low half of q11 is used).
    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      // Load the next four rows: d24 = row7 | row10, d26 = row8 | row9.
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      // Row views for this iteration:
      //   d16=row0 d17=row1 d18=row2 d19=row3 d20=row4 d21=row5 d22=row6
      //   d24=row7 d26=row8 d27=row9 d25=row10
      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // Four output rows; each call slides the 8-row window down by one.
      // The interleaved prefetches are hints only and do not affect results.
      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32  = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
                              d20s16, d21s16, d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32  = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
                              d21s16, d22s16, d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
                              d22s16, d24s16, d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
                              d24s16, d26s16, d27s16, d25s16, q0s16);

      // Round, shift right by 7 and saturate to unsigned 16 bits.
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // Saturate to 8 bits: d2u32 = out row0 | row1, d3u32 = out row2 | row3.
      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      // Slide the window four rows down: old rows 4..10 become rows 0..6
      // of the next iteration.
      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
}