/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <string.h>

/*
 * Fixed-point multipliers passed to vqdmulhq_n_s16() by the inverse
 * transform in this file. Both are fractions scaled by 65536:
 *
 *   cospi8sqrt2minus1 = 20091  ~ (sqrt(2) * cos(pi / 8) - 1) * 65536
 *   sinpi8sqrt2       = 35468  ~  sqrt(2) * sin(pi / 8)      * 65536
 *
 * 35468 does not fit in int16_t. The transform wants its 16-bit pattern
 * (0x8A8C), so the constant is spelled as 35468 - 65536 rather than relying
 * on the implementation-defined narrowing of an out-of-range initializer.
 * The multiply/shift/add sequence that uses it adds the operand back in,
 * which yields the effect of multiplying by 35468 / 65536.
 */
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2       = 35468 - 65536;  /* -30068 */

/*
 * Reads two 4-byte rows, at src and at src + stride, into one 64-bit vector.
 * The first row lands in 32-bit lane 0 and the second in lane 1.
 * The bytes are fetched with memcpy so that neither src nor stride has to be
 * a multiple of four.
 */
static inline uint8x8_t dqidct_load_two_rows(const unsigned char *src,
                                             int stride) {
    uint32_t first, second;
    uint32x2_t packed;

    memcpy(&first, src, sizeof(first));
    memcpy(&second, src + stride, sizeof(second));
    packed = vdup_n_u32(first);
    packed = vset_lane_u32(second, packed, 1);
    return vreinterpret_u8_u32(packed);
}

/*
 * Writes the two 32-bit lanes of `rows` to dst and dst + stride (4 bytes
 * each). Counterpart of dqidct_load_two_rows(); also alignment-agnostic.
 */
static inline void dqidct_store_two_rows(unsigned char *dst, int stride,
                                         uint8x8_t rows) {
    const uint32x2_t packed = vreinterpret_u32_u8(rows);
    const uint32_t first = vget_lane_u32(packed, 0);
    const uint32_t second = vget_lane_u32(packed, 1);

    memcpy(dst, &first, sizeof(first));
    memcpy(dst + stride, &second, sizeof(second));
}

/*
 * One 1-D pass of the 4-point inverse transform, computed for four lanes in
 * parallel.
 *
 *   in0, in2 : the inputs with index 0 and 2
 *   in13     : the inputs with index 1 (low half) and 3 (high half)
 *
 * Returns the four outputs in val[0]..val[3]. Every add and subtract
 * saturates to the int16_t range.
 */
static inline int16x4x4_t dqidct_pass(int16x4_t in0, int16x4_t in2,
                                      int16x8_t in13) {
    const int16x4_t even_sum = vqadd_s16(in0, in2);
    const int16x4_t even_diff = vqsub_s16(in0, in2);
    int16x8_t by_sin, by_cos;
    int16x4_t odd_diff, odd_sum;
    int16x4x4_t out;

    // vqdmulh returns (2 * x * k) >> 16; the extra >> 1 removes the doubling,
    // leaving (x * k) >> 16 with k taken as a signed 16-bit value.
    by_sin = vshrq_n_s16(vqdmulhq_n_s16(in13, sinpi8sqrt2), 1);
    by_cos = vshrq_n_s16(vqdmulhq_n_s16(in13, cospi8sqrt2minus1), 1);

    // Adding x back turns the "minus 1" cosine factor into the full factor,
    // and compensates for sinpi8sqrt2 being held as a negative 16-bit lane.
    by_sin = vqaddq_s16(by_sin, in13);
    by_cos = vqaddq_s16(by_cos, in13);

    // Low halves derive from input 1, high halves from input 3.
    odd_diff = vqsub_s16(vget_low_s16(by_sin), vget_high_s16(by_cos));
    odd_sum = vqadd_s16(vget_high_s16(by_sin), vget_low_s16(by_cos));

    out.val[0] = vqadd_s16(even_sum, odd_sum);
    out.val[1] = vqadd_s16(even_diff, odd_diff);
    out.val[2] = vqsub_s16(even_diff, odd_diff);
    out.val[3] = vqsub_s16(even_sum, odd_sum);
    return out;
}

/*
 * Transposes a 4x4 block of int16_t held as four 4-lane vectors:
 * result.val[c] lane r == m.val[r] lane c.
 */
static inline int16x4x4_t dqidct_transpose(int16x4x4_t m) {
    // Swap 32-bit halves between rows 0/2 and rows 1/3 ...
    const int32x2x2_t rows02 = vtrn_s32(vreinterpret_s32_s16(m.val[0]),
                                        vreinterpret_s32_s16(m.val[2]));
    const int32x2x2_t rows13 = vtrn_s32(vreinterpret_s32_s16(m.val[1]),
                                        vreinterpret_s32_s16(m.val[3]));
    // ... then interleave the 16-bit lanes to finish the transposition.
    const int16x4x2_t cols01 = vtrn_s16(vreinterpret_s16_s32(rows02.val[0]),
                                        vreinterpret_s16_s32(rows13.val[0]));
    const int16x4x2_t cols23 = vtrn_s16(vreinterpret_s16_s32(rows02.val[1]),
                                        vreinterpret_s16_s32(rows13.val[1]));
    int16x4x4_t t;

    t.val[0] = cols01.val[0];
    t.val[1] = cols01.val[1];
    t.val[2] = cols23.val[0];
    t.val[3] = cols23.val[1];
    return t;
}

/*
 * Dequantizes a 4x4 block of coefficients, applies the 4x4 inverse transform
 * and adds the result to the 4x4 block of pixels at dst, in place.
 *
 *   input  : 16 int16_t coefficients; read, then overwritten with zeros.
 *   dq     : 16 int16_t factors, multiplied element-wise with input.
 *   dst    : top-left byte of a 4x4 pixel block; read as the values to add
 *            to, then overwritten with the sums clamped to 0..255.
 *   stride : distance in bytes between consecutive rows of dst.
 */
void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    const int16x8_t zeros = vdupq_n_s16(0);
    unsigned char *const lower_rows = dst + stride + stride;
    int16x8_t coeff01, coeff23, scale01, scale23;
    int16x8_t recon01, recon23;
    uint8x8_t pred01, pred23;
    int16x4x4_t block;

    // Fetch the coefficients, clearing each half right after it is read.
    coeff01 = vld1q_s16(input);
    vst1q_s16(input, zeros);
    coeff23 = vld1q_s16(input + 8);
    vst1q_s16(input + 8, zeros);

    scale01 = vld1q_s16(dq);
    scale23 = vld1q_s16(dq + 8);

    // Current contents of dst: rows 0-1 and rows 2-3.
    pred01 = dqidct_load_two_rows(dst, stride);
    pred23 = dqidct_load_two_rows(lower_rows, stride);

    // Dequantize. The multiply keeps the low 16 bits of each product, which
    // is the same bit pattern for signed and unsigned operands.
    coeff01 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(coeff01),
                                              vreinterpretq_u16_s16(scale01)));
    coeff23 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(coeff23),
                                              vreinterpretq_u16_s16(scale23)));

    // First pass: combine the four coefficient rows lane by lane.
    block = dqidct_pass(vget_low_s16(coeff01), vget_low_s16(coeff23),
                        vcombine_s16(vget_high_s16(coeff01),
                                     vget_high_s16(coeff23)));
    block = dqidct_transpose(block);

    // Second pass on the transposed data, then a rounding shift: (x + 4) >> 3.
    block = dqidct_pass(block.val[0], block.val[2],
                        vcombine_s16(block.val[1], block.val[3]));
    block.val[0] = vrshr_n_s16(block.val[0], 3);
    block.val[1] = vrshr_n_s16(block.val[1], 3);
    block.val[2] = vrshr_n_s16(block.val[2], 3);
    block.val[3] = vrshr_n_s16(block.val[3], 3);

    // Transpose back so val[r] is output row r again.
    block = dqidct_transpose(block);

    // Widen the dst bytes, add them to the residual (modulo 2^16), then
    // narrow with unsigned saturation to 0..255.
    recon01 = vreinterpretq_s16_u16(
        vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(block.val[0],
                                                    block.val[1])),
                 pred01));
    recon23 = vreinterpretq_s16_u16(
        vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(block.val[2],
                                                    block.val[3])),
                 pred23));

    dqidct_store_two_rows(dst, stride, vqmovun_s16(recon01));
    dqidct_store_two_rows(lower_rows, stride, vqmovun_s16(recon23));
}