1b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian/* 2b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * 4b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian */ 10b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 11b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian#include <arm_neon.h> 12b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian 137bc9febe8749e98a3812a0dc4380ceae75c29450Johann#include "./vp8_rtcd.h" 147bc9febe8749e98a3812a0dc4380ceae75c29450Johann 15b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanianstatic const int16_t cospi8sqrt2minus1 = 20091; 16653cb5bbe266083a3c5c40eeafb5e88bededd566Johann// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of 17653cb5bbe266083a3c5c40eeafb5e88bededd566Johann// the way it is used in vqdmulh, where the result is doubled, it can be divided 18653cb5bbe266083a3c5c40eeafb5e88bededd566Johann// by 2 beforehand. This saves compensating for the negative value as well as 19653cb5bbe266083a3c5c40eeafb5e88bededd566Johann// shifting the result. 20653cb5bbe266083a3c5c40eeafb5e88bededd566Johannstatic const int16_t sinpi8sqrt2 = 35468 >> 1; 21653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 22653cb5bbe266083a3c5c40eeafb5e88bededd566Johannvoid vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst, 23653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int stride) { 24653cb5bbe266083a3c5c40eeafb5e88bededd566Johann unsigned char *dst0; 25653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int32x2_t d14, d15; 26653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; 27653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int16x8_t q1, q2, q3, q4, q5, q6; 28653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int16x8_t qEmpty = vdupq_n_s16(0); 29653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int32x2x2_t d2tmp0, d2tmp1; 30653cb5bbe266083a3c5c40eeafb5e88bededd566Johann int16x4x2_t d2tmp2, d2tmp3; 31653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 32653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d14 = d15 = vdup_n_s32(0); 33653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 34653cb5bbe266083a3c5c40eeafb5e88bededd566Johann // load input 35653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q3 = vld1q_s16(input); 36653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1q_s16(input, qEmpty); 37653cb5bbe266083a3c5c40eeafb5e88bededd566Johann input += 8; 38653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vld1q_s16(input); 39653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1q_s16(input, qEmpty); 40653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 41653cb5bbe266083a3c5c40eeafb5e88bededd566Johann // load dq 42653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q5 = vld1q_s16(dq); 43653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dq += 8; 44653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q6 = vld1q_s16(dq); 45653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 46653cb5bbe266083a3c5c40eeafb5e88bededd566Johann // load src from dst 47653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 = dst; 48653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); 49653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 50653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); 51653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 52653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); 53653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 54653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); 55653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 56653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q1 = vreinterpretq_s16_u16( 57653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vmulq_u16(vreinterpretq_u16_s16(q3), vreinterpretq_u16_s16(q5))); 58653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q2 = vreinterpretq_s16_u16( 59653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vmulq_u16(vreinterpretq_u16_s16(q4), vreinterpretq_u16_s16(q6))); 60653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 61653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); 62653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); 63653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 64653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); 65653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 66653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); 67653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); 68653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 69653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vshrq_n_s16(q4, 1); 70653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 71653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vqaddq_s16(q4, q2); 72653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 73653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); 74653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); 75653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 76653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2 = vqadd_s16(d12, d11); 77653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d3 = vqadd_s16(d13, d10); 78653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d4 = vqsub_s16(d13, d10); 79653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d5 = vqsub_s16(d12, d11); 80653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 81653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); 82653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); 83653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), 84653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vreinterpret_s16_s32(d2tmp1.val[0])); 85653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), 86653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vreinterpret_s16_s32(d2tmp1.val[1])); 87653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 88653cb5bbe266083a3c5c40eeafb5e88bededd566Johann // loop 2 89653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); 90653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 91653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); 92653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); 93653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 94653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); 95653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); 96653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 97653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vshrq_n_s16(q4, 1); 98653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 99653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q4 = vqaddq_s16(q4, q2); 100653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 101653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); 102653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); 103653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 104653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2 = vqadd_s16(d12, d11); 105653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d3 = vqadd_s16(d13, d10); 106653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d4 = vqsub_s16(d13, d10); 107653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d5 = vqsub_s16(d12, d11); 108653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 109653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2 = vrshr_n_s16(d2, 3); 110653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d3 = vrshr_n_s16(d3, 3); 111653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d4 = vrshr_n_s16(d4, 3); 112653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d5 = vrshr_n_s16(d5, 3); 113653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 114653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); 115653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); 116653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), 117653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vreinterpret_s16_s32(d2tmp1.val[0])); 118653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), 119653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vreinterpret_s16_s32(d2tmp1.val[1])); 120653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 121653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); 122653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); 123653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 124653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q1 = vreinterpretq_s16_u16( 125653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s32(d14))); 126653cb5bbe266083a3c5c40eeafb5e88bededd566Johann q2 = vreinterpretq_s16_u16( 127653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s32(d15))); 128653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 129653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); 130653cb5bbe266083a3c5c40eeafb5e88bededd566Johann d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); 131653cb5bbe266083a3c5c40eeafb5e88bededd566Johann 132653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 = dst; 133653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1_lane_s32((int32_t *)dst0, d14, 0); 134653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 135653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1_lane_s32((int32_t *)dst0, d14, 1); 136653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 137653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1_lane_s32((int32_t *)dst0, d15, 0); 138653cb5bbe266083a3c5c40eeafb5e88bededd566Johann dst0 += stride; 139653cb5bbe266083a3c5c40eeafb5e88bededd566Johann vst1_lane_s32((int32_t *)dst0, d15, 1); 140653cb5bbe266083a3c5c40eeafb5e88bededd566Johann return; 141b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian} 142