1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <arm_neon.h> 12 13#include "./vpx_dsp_rtcd.h" 14#include "vpx_dsp/arm/idct_neon.h" 15#include "vpx_dsp/inv_txfm.h" 16 17static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, 18 const int stride, 19 const int16x8_t res, 20 const int16x8_t max) { 21 const uint16x8_t a0 = vld1q_u16(*dest); 22 const uint16x8_t a1 = vld1q_u16(*dest + 8); 23 const uint16x8_t a2 = vld1q_u16(*dest + 16); 24 const uint16x8_t a3 = vld1q_u16(*dest + 24); 25 const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); 26 const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); 27 const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); 28 const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); 29 const int16x8_t c0 = vminq_s16(b0, max); 30 const int16x8_t c1 = vminq_s16(b1, max); 31 const int16x8_t c2 = vminq_s16(b2, max); 32 const int16x8_t c3 = vminq_s16(b3, max); 33 vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); 34 vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); 35 vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); 36 vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); 37 *dest += stride; 38} 39 40static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, 41 const int stride, 42 const int16x8_t res) { 43 const uint16x8_t a0 = vld1q_u16(*dest); 44 const uint16x8_t a1 = vld1q_u16(*dest + 8); 45 const uint16x8_t a2 = vld1q_u16(*dest + 16); 46 const uint16x8_t a3 = vld1q_u16(*dest + 24); 47 const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); 48 const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); 49 const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); 50 const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); 51 const uint16x8_t c0 = vqshluq_n_s16(b0, 0); 52 const uint16x8_t c1 = vqshluq_n_s16(b1, 0); 53 const uint16x8_t c2 = vqshluq_n_s16(b2, 0); 54 const uint16x8_t c3 = vqshluq_n_s16(b3, 0); 55 vst1q_u16(*dest, c0); 56 vst1q_u16(*dest + 8, c1); 57 vst1q_u16(*dest + 16, c2); 58 vst1q_u16(*dest + 24, c3); 59 *dest += stride; 60} 61 62void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, 63 int stride, int bd) { 64 const tran_low_t out0 = 65 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); 66 const tran_low_t out1 = 67 HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); 68 const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); 69 const int16x8_t dc = vdupq_n_s16(a1); 70 int i; 71 72 if (a1 >= 0) { 73 const int16x8_t max = vdupq_n_s16((1 << bd) - 1); 74 for (i = 0; i < 8; ++i) { 75 highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); 76 highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); 77 highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); 78 highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); 79 } 80 } else { 81 for (i = 0; i < 8; ++i) { 82 highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); 83 highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); 84 highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); 85 highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); 86 } 87 } 88} 89