/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
#include <arm_neon.h>
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan
static const int16_t cospi8sqrt2minus1 = 20091;
// 35468 does not fit in int16_t: stored directly it is converted to a
// negative value, which then has to be compensated for with an extra shift
// and add after every multiply. vqdmulh doubles the product before taking the
// high half, so storing half the constant yields (x * 35468) >> 16 directly
// and gives bit-identical results without the out-of-range conversion.
static const int16_t sinpi8sqrt2 = 35468 >> 1;

// Runs one 1-D pass of the 4-point inverse transform on four lanes at once.
//   in0, in2: the first and third input vectors.
//   in13:     the second input vector in the low half and the fourth in the
//             high half.
// Returns the four output vectors in val[0..3]. All additions and
// subtractions saturate.
static int16x4x4_t dequant_idct_pass(int16x4_t in0, int16x4_t in2,
                                     int16x8_t in13) {
    const int16x4_t a1 = vqadd_s16(in0, in2);
    const int16x4_t b1 = vqsub_s16(in0, in2);
    // (x * 35468) >> 16 for every lane; see the note on sinpi8sqrt2.
    const int16x8_t sin_term = vqdmulhq_n_s16(in13, sinpi8sqrt2);
    // x + ((x * 20091) >> 16): halve the doubled product, then add x back
    // because the constant holds the multiplier minus one.
    const int16x8_t cos_term = vqaddq_s16(
        vshrq_n_s16(vqdmulhq_n_s16(in13, cospi8sqrt2minus1), 1), in13);
    const int16x4_t c1 = vqsub_s16(vget_low_s16(sin_term),
                                   vget_high_s16(cos_term));
    const int16x4_t d1 = vqadd_s16(vget_high_s16(sin_term),
                                   vget_low_s16(cos_term));
    int16x4x4_t out;

    out.val[0] = vqadd_s16(a1, d1);
    out.val[1] = vqadd_s16(b1, c1);
    out.val[2] = vqsub_s16(b1, c1);
    out.val[3] = vqsub_s16(a1, d1);
    return out;
}

// Transposes a 4x4 block of 16-bit values: val[i] of the result holds lane i
// of each of the four input vectors, in order.
static int16x4x4_t dequant_idct_transpose4x4(int16x4x4_t m) {
    const int32x2x2_t pairs02 = vtrn_s32(vreinterpret_s32_s16(m.val[0]),
                                         vreinterpret_s32_s16(m.val[2]));
    const int32x2x2_t pairs13 = vtrn_s32(vreinterpret_s32_s16(m.val[1]),
                                         vreinterpret_s32_s16(m.val[3]));
    const int16x4x2_t cols01 = vtrn_s16(vreinterpret_s16_s32(pairs02.val[0]),
                                        vreinterpret_s16_s32(pairs13.val[0]));
    const int16x4x2_t cols23 = vtrn_s16(vreinterpret_s16_s32(pairs02.val[1]),
                                        vreinterpret_s16_s32(pairs13.val[1]));
    int16x4x4_t out;

    out.val[0] = cols01.val[0];
    out.val[1] = cols01.val[1];
    out.val[2] = cols23.val[0];
    out.val[3] = cols23.val[1];
    return out;
}

// Dequantizes a 4x4 block of coefficients, applies the 4x4 inverse transform
// and adds the result to the 4x4 pixel block at dst.
//   input:  16 coefficients; read and then overwritten with zeros.
//   dq:     16 dequantization factors, multiplied element-wise with input.
//   dst:    top-left pixel of the 4x4 block; it supplies the prediction and
//           receives the reconstructed pixels, saturated to 0..255.
//   stride: distance in bytes between consecutive rows of dst.
// NOTE(review): each dst row is read and written as a single 32-bit lane
// through an int32_t pointer cast; confirm callers provide rows for which
// that access is valid on the target.
void vp8_dequant_idct_add_neon(
        int16_t *input,
        int16_t *dq,
        unsigned char *dst,
        int stride) {
    unsigned char *row;
    int32x2_t pred01 = vdup_n_s32(0);
    int32x2_t pred23 = vdup_n_s32(0);
    int16x8_t coef_lo, coef_hi, dq_lo, dq_hi, sum01, sum23;
    int16x4x4_t t;
    const int16x8_t zero = vdupq_n_s16(0);

    // Load the coefficients and clear the caller's buffer behind us.
    coef_lo = vld1q_s16(input);
    vst1q_s16(input, zero);
    coef_hi = vld1q_s16(input + 8);
    vst1q_s16(input + 8, zero);

    // Load the dequantization factors.
    dq_lo = vld1q_s16(dq);
    dq_hi = vld1q_s16(dq + 8);

    // Load the four prediction rows, two rows per 64-bit vector.
    row = dst;
    pred01 = vld1_lane_s32((const int32_t *)row, pred01, 0);
    row += stride;
    pred01 = vld1_lane_s32((const int32_t *)row, pred01, 1);
    row += stride;
    pred23 = vld1_lane_s32((const int32_t *)row, pred23, 0);
    row += stride;
    pred23 = vld1_lane_s32((const int32_t *)row, pred23, 1);

    // Dequantize; the multiply is done on unsigned lanes so it wraps.
    coef_lo = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(coef_lo),
                                              vreinterpretq_u16_s16(dq_lo)));
    coef_hi = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(coef_hi),
                                              vreinterpretq_u16_s16(dq_hi)));

    // First pass, then transpose so the second pass works across the other
    // dimension.
    t = dequant_idct_pass(vget_low_s16(coef_lo), vget_low_s16(coef_hi),
                          vcombine_s16(vget_high_s16(coef_lo),
                                       vget_high_s16(coef_hi)));
    t = dequant_idct_transpose4x4(t);

    // Second pass, followed by the rounding shift by 3.
    t = dequant_idct_pass(t.val[0], t.val[2],
                          vcombine_s16(t.val[1], t.val[3]));
    t.val[0] = vrshr_n_s16(t.val[0], 3);
    t.val[1] = vrshr_n_s16(t.val[1], 3);
    t.val[2] = vrshr_n_s16(t.val[2], 3);
    t.val[3] = vrshr_n_s16(t.val[3], 3);

    // Transpose back so that each vector is one row of the output block.
    t = dequant_idct_transpose4x4(t);
    sum01 = vcombine_s16(t.val[0], t.val[1]);
    sum23 = vcombine_s16(t.val[2], t.val[3]);

    // Add the prediction and saturate to unsigned 8-bit.
    sum01 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(sum01),
                                           vreinterpret_u8_s32(pred01)));
    sum23 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(sum23),
                                           vreinterpret_u8_s32(pred23)));
    pred01 = vreinterpret_s32_u8(vqmovun_s16(sum01));
    pred23 = vreinterpret_s32_u8(vqmovun_s16(sum23));

    // Store the four reconstructed rows.
    row = dst;
    vst1_lane_s32((int32_t *)row, pred01, 0);
    row += stride;
    vst1_lane_s32((int32_t *)row, pred01, 1);
    row += stride;
    vst1_lane_s32((int32_t *)row, pred23, 0);
    row += stride;
    vst1_lane_s32((int32_t *)row, pred23, 1);
}
143