/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
11b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include <arm_neon.h>
12b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
137bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vp8_rtcd.h"
147bc9febe8749e98a3812a0dc4380ceae75c29450Johann
// Fixed-point multipliers used by the inverse transform in this file.
// NOTE(review): the names suggest sqrt(2)*cos(pi/8) - 1 and sqrt(2)*sin(pi/8)
// scaled by 65536 -- confirm against the VP8 specification.
static const int16_t cospi8sqrt2minus1 = 20091;

// The full-scale sine multiplier, 35468, is larger than INT16_MAX and would
// turn negative when stored in an int16_t. It is consumed by vqdmulh, which
// doubles the product, so the constant is kept pre-halved instead. That removes
// the need to compensate for a negative multiplier and to shift the product
// afterwards.
static const int16_t sinpi8sqrt2 = 35468 / 2;
21653cb5bbe266083a3c5c40eeafb5e88bededd566Johann
// Runs one 1-D pass of the 4-point inverse transform on four 4-lane vectors.
//
// rows.val[0] and rows.val[2] form the plain sum/difference pair, while
// rows.val[1] and rows.val[3] are first scaled by the two fixed-point
// multipliers. Every add and subtract saturates. The four result vectors are
// returned in val[0]..val[3].
static inline int16x4x4_t dequant_idct_pass_1d(const int16x4x4_t rows) {
  const int16x4_t even_sum = vqadd_s16(rows.val[0], rows.val[2]);
  const int16x4_t even_diff = vqsub_s16(rows.val[0], rows.val[2]);

  // Pack both "odd" vectors into one 128-bit register so each multiply
  // handles them together.
  const int16x8_t odd = vcombine_s16(rows.val[1], rows.val[3]);

  // vqdmulh returns the high half of the doubled product. sinpi8sqrt2 is
  // stored pre-halved, so no further scaling is required here.
  const int16x8_t odd_sin = vqdmulhq_n_s16(odd, sinpi8sqrt2);

  // For the other multiplier the doubling is undone with a right shift, and
  // the unscaled input is then added back on top of the scaled value.
  const int16x8_t odd_cos = vqaddq_s16(
      vshrq_n_s16(vqdmulhq_n_s16(odd, cospi8sqrt2minus1), 1), odd);

  const int16x4_t odd_diff =
      vqsub_s16(vget_low_s16(odd_sin), vget_high_s16(odd_cos));
  const int16x4_t odd_sum =
      vqadd_s16(vget_high_s16(odd_sin), vget_low_s16(odd_cos));

  int16x4x4_t out;
  out.val[0] = vqadd_s16(even_sum, odd_sum);
  out.val[1] = vqadd_s16(even_diff, odd_diff);
  out.val[2] = vqsub_s16(even_diff, odd_diff);
  out.val[3] = vqsub_s16(even_sum, odd_sum);
  return out;
}

// Transposes a 4x4 block of 16-bit values held as four 4-lane vectors.
//
// The 32-bit transposes swap the 2x2 quadrants; the 16-bit transposes then
// swap elements inside each quadrant.
static inline int16x4x4_t dequant_idct_transpose_4x4(const int16x4x4_t m) {
  const int32x2x2_t pair02 = vtrn_s32(vreinterpret_s32_s16(m.val[0]),
                                      vreinterpret_s32_s16(m.val[2]));
  const int32x2x2_t pair13 = vtrn_s32(vreinterpret_s32_s16(m.val[1]),
                                      vreinterpret_s32_s16(m.val[3]));
  const int16x4x2_t top = vtrn_s16(vreinterpret_s16_s32(pair02.val[0]),
                                   vreinterpret_s16_s32(pair13.val[0]));
  const int16x4x2_t bottom = vtrn_s16(vreinterpret_s16_s32(pair02.val[1]),
                                      vreinterpret_s16_s32(pair13.val[1]));

  int16x4x4_t t;
  t.val[0] = top.val[0];
  t.val[1] = top.val[1];
  t.val[2] = bottom.val[0];
  t.val[3] = bottom.val[1];
  return t;
}

// Dequantizes a 4x4 coefficient block, inverse-transforms it and adds the
// result to the 4x4 pixel block at dst.
//
//   input  - 16 coefficients; read, then overwritten with zeros.
//   dq     - 16 dequantization factors, multiplied element-wise with input
//            (low 16 bits of each product are kept).
//   dst    - top-left byte of the 4x4 destination; four bytes are read and
//            written on each of four rows.
//   stride - byte distance between consecutive destination rows.
//
// NOTE(review): each destination row is accessed through an int32_t pointer
// cast, so callers presumably guarantee suitable alignment -- confirm.
void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
                               int stride) {
  const int16x8_t zeros = vdupq_n_s16(0);

  // Pull in the coefficients, clearing each half right after it is read.
  const int16x8_t coef_lo = vld1q_s16(input);
  vst1q_s16(input, zeros);
  const int16x8_t coef_hi = vld1q_s16(input + 8);
  vst1q_s16(input + 8, zeros);

  const int16x8_t dq_lo = vld1q_s16(dq);
  const int16x8_t dq_hi = vld1q_s16(dq + 8);

  // Addresses of the four destination rows.
  unsigned char *const row0 = dst;
  unsigned char *const row1 = row0 + stride;
  unsigned char *const row2 = row1 + stride;
  unsigned char *const row3 = row2 + stride;

  // Gather the existing pixels: rows 0/1 in one register, rows 2/3 in another.
  int32x2_t pix01 = vdup_n_s32(0);
  int32x2_t pix23 = vdup_n_s32(0);
  pix01 = vld1_lane_s32((const int32_t *)row0, pix01, 0);
  pix01 = vld1_lane_s32((const int32_t *)row1, pix01, 1);
  pix23 = vld1_lane_s32((const int32_t *)row2, pix23, 0);
  pix23 = vld1_lane_s32((const int32_t *)row3, pix23, 1);

  // Dequantize. The multiply is done on the unsigned reinterpretation, which
  // keeps the low 16 bits of each product.
  const int16x8_t deq_lo = vreinterpretq_s16_u16(
      vmulq_u16(vreinterpretq_u16_s16(coef_lo), vreinterpretq_u16_s16(dq_lo)));
  const int16x8_t deq_hi = vreinterpretq_s16_u16(
      vmulq_u16(vreinterpretq_u16_s16(coef_hi), vreinterpretq_u16_s16(dq_hi)));

  int16x4x4_t blk;
  blk.val[0] = vget_low_s16(deq_lo);
  blk.val[1] = vget_high_s16(deq_lo);
  blk.val[2] = vget_low_s16(deq_hi);
  blk.val[3] = vget_high_s16(deq_hi);

  // First pass, then transpose so the second pass runs along the other axis.
  blk = dequant_idct_transpose_4x4(dequant_idct_pass_1d(blk));

  // Second pass, followed by a rounding shift right by 3.
  blk = dequant_idct_pass_1d(blk);
  for (int i = 0; i < 4; ++i) {
    blk.val[i] = vrshr_n_s16(blk.val[i], 3);
  }

  // Transpose back so each vector lines up with one destination row.
  blk = dequant_idct_transpose_4x4(blk);

  // Widen the pixels to 16 bits and add them to the residual.
  const int16x8_t sum01 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(blk.val[0], blk.val[1])),
               vreinterpret_u8_s32(pix01)));
  const int16x8_t sum23 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(vcombine_s16(blk.val[2], blk.val[3])),
               vreinterpret_u8_s32(pix23)));

  // Saturate to the unsigned 8-bit range and write the four rows back.
  const int32x2_t out01 = vreinterpret_s32_u8(vqmovun_s16(sum01));
  const int32x2_t out23 = vreinterpret_s32_u8(vqmovun_s16(sum23));
  vst1_lane_s32((int32_t *)row0, out01, 0);
  vst1_lane_s32((int32_t *)row1, out01, 1);
  vst1_lane_s32((int32_t *)row2, out23, 0);
  vst1_lane_s32((int32_t *)row3, out23, 1);
}
142