1233d2500723e5594f3e7c70896ffeeef32b9c950ywan/* 2233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3233d2500723e5594f3e7c70896ffeeef32b9c950ywan * 4233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Use of this source code is governed by a BSD-style license 5233d2500723e5594f3e7c70896ffeeef32b9c950ywan * that can be found in the LICENSE file in the root of the source 6233d2500723e5594f3e7c70896ffeeef32b9c950ywan * tree. An additional intellectual property rights grant can be found 7233d2500723e5594f3e7c70896ffeeef32b9c950ywan * in the file PATENTS. All contributing project authors may 8233d2500723e5594f3e7c70896ffeeef32b9c950ywan * be found in the AUTHORS file in the root of the source tree. 9233d2500723e5594f3e7c70896ffeeef32b9c950ywan */ 10233d2500723e5594f3e7c70896ffeeef32b9c950ywan 11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <arm_neon.h> 12233d2500723e5594f3e7c70896ffeeef32b9c950ywan 13233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic const int16_t cospi8sqrt2minus1 = 20091; 14233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic const int16_t sinpi8sqrt2 = 35468; 15233d2500723e5594f3e7c70896ffeeef32b9c950ywan 16233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp8_dequant_idct_add_neon( 17233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16_t *input, 18233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16_t *dq, 19233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst, 20233d2500723e5594f3e7c70896ffeeef32b9c950ywan int stride) { 21233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst0; 22233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32x2_t d14, d15; 23233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; 24233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16x8_t q1, q2, q3, q4, q5, q6; 25233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16x8_t qEmpty = vdupq_n_s16(0); 26233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32x2x2_t d2tmp0, d2tmp1; 27233d2500723e5594f3e7c70896ffeeef32b9c950ywan int16x4x2_t d2tmp2, d2tmp3; 28233d2500723e5594f3e7c70896ffeeef32b9c950ywan 29233d2500723e5594f3e7c70896ffeeef32b9c950ywan d14 = d15 = vdup_n_s32(0); 30233d2500723e5594f3e7c70896ffeeef32b9c950ywan 31233d2500723e5594f3e7c70896ffeeef32b9c950ywan // load input 32233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vld1q_s16(input); 33233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1q_s16(input, qEmpty); 34233d2500723e5594f3e7c70896ffeeef32b9c950ywan input += 8; 35233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vld1q_s16(input); 36233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1q_s16(input, qEmpty); 37233d2500723e5594f3e7c70896ffeeef32b9c950ywan 38233d2500723e5594f3e7c70896ffeeef32b9c950ywan // load dq 39233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5 = vld1q_s16(dq); 40233d2500723e5594f3e7c70896ffeeef32b9c950ywan dq += 8; 41233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6 = vld1q_s16(dq); 42233d2500723e5594f3e7c70896ffeeef32b9c950ywan 43233d2500723e5594f3e7c70896ffeeef32b9c950ywan // load src from dst 44233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 = dst; 45233d2500723e5594f3e7c70896ffeeef32b9c950ywan d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0); 46233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 47233d2500723e5594f3e7c70896ffeeef32b9c950ywan d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1); 48233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 49233d2500723e5594f3e7c70896ffeeef32b9c950ywan d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0); 50233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 51233d2500723e5594f3e7c70896ffeeef32b9c950ywan d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1); 52233d2500723e5594f3e7c70896ffeeef32b9c950ywan 53233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3), 54233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpretq_u16_s16(q5))); 55233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4), 56233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpretq_u16_s16(q6))); 57233d2500723e5594f3e7c70896ffeeef32b9c950ywan 58233d2500723e5594f3e7c70896ffeeef32b9c950ywan d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2)); 59233d2500723e5594f3e7c70896ffeeef32b9c950ywan d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2)); 60233d2500723e5594f3e7c70896ffeeef32b9c950ywan 61233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2)); 62233d2500723e5594f3e7c70896ffeeef32b9c950ywan 63233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); 64233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); 65233d2500723e5594f3e7c70896ffeeef32b9c950ywan 66233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vshrq_n_s16(q3, 1); 67233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vshrq_n_s16(q4, 1); 68233d2500723e5594f3e7c70896ffeeef32b9c950ywan 69233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vqaddq_s16(q3, q2); 70233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vqaddq_s16(q4, q2); 71233d2500723e5594f3e7c70896ffeeef32b9c950ywan 72233d2500723e5594f3e7c70896ffeeef32b9c950ywan d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); 73233d2500723e5594f3e7c70896ffeeef32b9c950ywan d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); 74233d2500723e5594f3e7c70896ffeeef32b9c950ywan 75233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2 = vqadd_s16(d12, d11); 76233d2500723e5594f3e7c70896ffeeef32b9c950ywan d3 = vqadd_s16(d13, d10); 77233d2500723e5594f3e7c70896ffeeef32b9c950ywan d4 = vqsub_s16(d13, d10); 78233d2500723e5594f3e7c70896ffeeef32b9c950ywan d5 = vqsub_s16(d12, d11); 79233d2500723e5594f3e7c70896ffeeef32b9c950ywan 80233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); 81233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); 82233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), 83233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_s16_s32(d2tmp1.val[0])); 84233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), 85233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_s16_s32(d2tmp1.val[1])); 86233d2500723e5594f3e7c70896ffeeef32b9c950ywan 87233d2500723e5594f3e7c70896ffeeef32b9c950ywan // loop 2 88233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]); 89233d2500723e5594f3e7c70896ffeeef32b9c950ywan 90233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2); 91233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1); 92233d2500723e5594f3e7c70896ffeeef32b9c950ywan 93233d2500723e5594f3e7c70896ffeeef32b9c950ywan d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]); 94233d2500723e5594f3e7c70896ffeeef32b9c950ywan d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]); 95233d2500723e5594f3e7c70896ffeeef32b9c950ywan 96233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vshrq_n_s16(q3, 1); 97233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vshrq_n_s16(q4, 1); 98233d2500723e5594f3e7c70896ffeeef32b9c950ywan 99233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = vqaddq_s16(q3, q2); 100233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = vqaddq_s16(q4, q2); 101233d2500723e5594f3e7c70896ffeeef32b9c950ywan 102233d2500723e5594f3e7c70896ffeeef32b9c950ywan d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4)); 103233d2500723e5594f3e7c70896ffeeef32b9c950ywan d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4)); 104233d2500723e5594f3e7c70896ffeeef32b9c950ywan 105233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2 = vqadd_s16(d12, d11); 106233d2500723e5594f3e7c70896ffeeef32b9c950ywan d3 = vqadd_s16(d13, d10); 107233d2500723e5594f3e7c70896ffeeef32b9c950ywan d4 = vqsub_s16(d13, d10); 108233d2500723e5594f3e7c70896ffeeef32b9c950ywan d5 = vqsub_s16(d12, d11); 109233d2500723e5594f3e7c70896ffeeef32b9c950ywan 110233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2 = vrshr_n_s16(d2, 3); 111233d2500723e5594f3e7c70896ffeeef32b9c950ywan d3 = vrshr_n_s16(d3, 3); 112233d2500723e5594f3e7c70896ffeeef32b9c950ywan d4 = vrshr_n_s16(d4, 3); 113233d2500723e5594f3e7c70896ffeeef32b9c950ywan d5 = vrshr_n_s16(d5, 3); 114233d2500723e5594f3e7c70896ffeeef32b9c950ywan 115233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); 116233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); 117233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]), 118233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_s16_s32(d2tmp1.val[0])); 119233d2500723e5594f3e7c70896ffeeef32b9c950ywan d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]), 120233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_s16_s32(d2tmp1.val[1])); 121233d2500723e5594f3e7c70896ffeeef32b9c950ywan 122233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]); 123233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]); 124233d2500723e5594f3e7c70896ffeeef32b9c950ywan 125233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), 126233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_u8_s32(d14))); 127233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), 128233d2500723e5594f3e7c70896ffeeef32b9c950ywan vreinterpret_u8_s32(d15))); 129233d2500723e5594f3e7c70896ffeeef32b9c950ywan 130233d2500723e5594f3e7c70896ffeeef32b9c950ywan d14 = vreinterpret_s32_u8(vqmovun_s16(q1)); 131233d2500723e5594f3e7c70896ffeeef32b9c950ywan d15 = vreinterpret_s32_u8(vqmovun_s16(q2)); 132233d2500723e5594f3e7c70896ffeeef32b9c950ywan 133233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 = dst; 134233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1_lane_s32((int32_t *)dst0, d14, 0); 135233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 136233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1_lane_s32((int32_t *)dst0, d14, 1); 137233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 138233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1_lane_s32((int32_t *)dst0, d15, 0); 139233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst0 += stride; 140233d2500723e5594f3e7c70896ffeeef32b9c950ywan vst1_lane_s32((int32_t *)dst0, d15, 1); 141233d2500723e5594f3e7c70896ffeeef32b9c950ywan return; 142233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 143