/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11#include <arm_neon.h>
12
13#include "./vpx_dsp_rtcd.h"
14#include "vpx_dsp/arm/idct_neon.h"
15#include "vpx_dsp/inv_txfm.h"
16
17static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
18                                              const uint8x16_t res) {
19  const uint8x16_t a = vld1q_u8(*dest);
20  const uint8x16_t b = vqaddq_u8(a, res);
21  vst1q_u8(*dest, b);
22  *dest += stride;
23}
24
25static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
26                                              const uint8x16_t res) {
27  const uint8x16_t a = vld1q_u8(*dest);
28  const uint8x16_t b = vqsubq_u8(a, res);
29  vst1q_u8(*dest, b);
30  *dest += stride;
31}
32
33void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
34                              int stride) {
35  const int16_t out0 =
36      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
37  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
38  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
39
40  if (a1 >= 0) {
41    const uint8x16_t dc = create_dcq(a1);
42    idct16x16_1_add_pos_kernel(&dest, stride, dc);
43    idct16x16_1_add_pos_kernel(&dest, stride, dc);
44    idct16x16_1_add_pos_kernel(&dest, stride, dc);
45    idct16x16_1_add_pos_kernel(&dest, stride, dc);
46    idct16x16_1_add_pos_kernel(&dest, stride, dc);
47    idct16x16_1_add_pos_kernel(&dest, stride, dc);
48    idct16x16_1_add_pos_kernel(&dest, stride, dc);
49    idct16x16_1_add_pos_kernel(&dest, stride, dc);
50    idct16x16_1_add_pos_kernel(&dest, stride, dc);
51    idct16x16_1_add_pos_kernel(&dest, stride, dc);
52    idct16x16_1_add_pos_kernel(&dest, stride, dc);
53    idct16x16_1_add_pos_kernel(&dest, stride, dc);
54    idct16x16_1_add_pos_kernel(&dest, stride, dc);
55    idct16x16_1_add_pos_kernel(&dest, stride, dc);
56    idct16x16_1_add_pos_kernel(&dest, stride, dc);
57    idct16x16_1_add_pos_kernel(&dest, stride, dc);
58  } else {
59    const uint8x16_t dc = create_dcq(-a1);
60    idct16x16_1_add_neg_kernel(&dest, stride, dc);
61    idct16x16_1_add_neg_kernel(&dest, stride, dc);
62    idct16x16_1_add_neg_kernel(&dest, stride, dc);
63    idct16x16_1_add_neg_kernel(&dest, stride, dc);
64    idct16x16_1_add_neg_kernel(&dest, stride, dc);
65    idct16x16_1_add_neg_kernel(&dest, stride, dc);
66    idct16x16_1_add_neg_kernel(&dest, stride, dc);
67    idct16x16_1_add_neg_kernel(&dest, stride, dc);
68    idct16x16_1_add_neg_kernel(&dest, stride, dc);
69    idct16x16_1_add_neg_kernel(&dest, stride, dc);
70    idct16x16_1_add_neg_kernel(&dest, stride, dc);
71    idct16x16_1_add_neg_kernel(&dest, stride, dc);
72    idct16x16_1_add_neg_kernel(&dest, stride, dc);
73    idct16x16_1_add_neg_kernel(&dest, stride, dc);
74    idct16x16_1_add_neg_kernel(&dest, stride, dc);
75    idct16x16_1_add_neg_kernel(&dest, stride, dc);
76  }
77}
78