1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12#include "./vpx_config.h"
13
/*
 * Applies the simple loop filter across one horizontal edge, 16 pixels wide,
 * modifying the picture in place.
 *
 *   s      - address of row q0. The rows at s - 2 * p (p1), s - p (p0),
 *            s (q0) and s + p (q1) are each read as 16 bytes; only rows
 *            s - p and s are written back.
 *   p      - byte distance between consecutive rows.
 *   blimit - pointer to the edge limit; a single byte is read and broadcast
 *            to all lanes.
 *
 * Per lane, using saturating 8-bit arithmetic throughout:
 *   mask   = (|p0 - q0| * 2 + |p1 - q1| / 2) <= *blimit ? 0xff : 0x00
 *   filter = sat8(3 * (q0 - p0) + sat8(p1 - q1)) & mask
 *   p0     = sat8(p0 + (sat8(filter + 3) >> 3))
 *   q0     = sat8(q0 - (sat8(filter + 4) >> 3))
 * The signed steps operate on samples XOR-ed with 0x80 (unsigned 0..255
 * mapped onto signed -128..127); the same XOR is undone before storing.
 */
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
    unsigned char *s, int p, const unsigned char *blimit) {
  /* Constants replicated across every lane. */
  const uint8x16_t edge_limit = vdupq_n_u8(*blimit);
  const uint8x16_t sign_flip = vdupq_n_u8(0x80);
  const int16x8_t triple = vdupq_n_s16(3);
  const int8x16_t round3 = vreinterpretq_s8_u8(vdupq_n_u8(3));
  const int8x16_t round4 = vreinterpretq_s8_u8(vdupq_n_u8(4));

  /* The four rows taking part in the filter. */
  const uint8x16_t p1 = vld1q_u8(s - (p << 1));
  const uint8x16_t p0 = vld1q_u8(s - p);
  const uint8x16_t q0 = vld1q_u8(s);
  const uint8x16_t q1 = vld1q_u8(s + p);

  /* Edge-activity measure, computed on the unsigned samples:
   * |p0 - q0| * 2 + |p1 - q1| / 2, saturating at 255. */
  const uint8x16_t abs_p0_q0 = vabdq_u8(p0, q0);
  const uint8x16_t abs_p1_q1 = vabdq_u8(p1, q1);
  const uint8x16_t activity = vqaddq_u8(vqaddq_u8(abs_p0_q0, abs_p0_q0),
                                        vshrq_n_u8(abs_p1_q1, 1));
  /* All-ones in lanes where the activity is within the limit, else zero. */
  const uint8x16_t mask = vcgeq_u8(edge_limit, activity);

  /* Signed views of the samples (value ^ 0x80). */
  const int8x16_t p1s = vreinterpretq_s8_u8(veorq_u8(p1, sign_flip));
  const int8x16_t p0s = vreinterpretq_s8_u8(veorq_u8(p0, sign_flip));
  const int8x16_t q0s = vreinterpretq_s8_u8(veorq_u8(q0, sign_flip));
  const int8x16_t q1s = vreinterpretq_s8_u8(veorq_u8(q1, sign_flip));

  /* sat8(p1 - q1), added to 3 * (q0 - p0) below. */
  const int8x16_t outer_diff = vqsubq_s8(p1s, q1s);

  /* 3 * (q0 - p0) + outer_diff, evaluated in 16 bits so the product cannot
   * overflow; low and high halves of the vector are handled separately. */
  const int16x8_t wide_lo = vaddw_s8(
      vmulq_s16(vsubl_s8(vget_low_s8(q0s), vget_low_s8(p0s)), triple),
      vget_low_s8(outer_diff));
  const int16x8_t wide_hi = vaddw_s8(
      vmulq_s16(vsubl_s8(vget_high_s8(q0s), vget_high_s8(p0s)), triple),
      vget_high_s8(outer_diff));

  /* Saturate back to 8 bits and zero the lanes that failed the mask test. */
  const int8x16_t filter =
      vandq_s8(vcombine_s8(vqmovn_s16(wide_lo), vqmovn_s16(wide_hi)),
               vreinterpretq_s8_u8(mask));

  /* The two adjustments differ only in their rounding bias (3 vs. 4). */
  const int8x16_t p0_adjust = vshrq_n_s8(vqaddq_s8(filter, round3), 3);
  const int8x16_t q0_adjust = vshrq_n_s8(vqaddq_s8(filter, round4), 3);

  /* Apply the adjustments and convert back to unsigned samples. */
  const uint8x16_t p0_out =
      veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(p0s, p0_adjust)), sign_flip);
  const uint8x16_t q0_out =
      veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(q0s, q0_adjust)), sign_flip);

  vst1q_u8(s, q0_out);
  vst1q_u8(s - p, p0_out);
}
88
/*
 * Runs the simple horizontal-edge filter three times, at the rows located
 * 4, 8 and 12 strides below y_ptr (in that order).
 *
 *   y_ptr    - start address; the row at y_ptr itself is not used as a
 *              filter position.
 *   y_stride - byte distance between consecutive rows.
 *   blimit   - edge limit forwarded unchanged to each filter call.
 */
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int edge;

  for (edge = 0; edge < 3; ++edge) {
    y_ptr += y_stride * 4;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
  }
}
99
/*
 * Runs the simple horizontal-edge filter once, at the row addressed by
 * y_ptr.
 *
 *   y_ptr    - filter position, passed straight through to the edge filter.
 *   y_stride - byte distance between consecutive rows.
 *   blimit   - edge limit forwarded unchanged to the filter call.
 */
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
}
105