/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"

/*
 * Applies the VP8 "simple" loop filter to one horizontal edge, processing
 * 16 adjacent byte columns at once.
 *
 * Four rows are read: s - 2*p, s - p, s and s + p (called p1, p0, q0 and q1
 * below, following the usual loop-filter naming). Only the two rows adjacent
 * to the edge, s - p (p0) and s (q0), are written back.
 *
 *   s      - address of the first byte of the row directly below the edge.
 *   p      - distance in bytes between vertically adjacent rows.
 *   blimit - pointer to the edge limit; only the first byte is read and it
 *            is applied to all 16 columns.
 */
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
    unsigned char *s, int p, const unsigned char *blimit) {
  const uint8x16_t limit = vdupq_n_u8(*blimit);
  const uint8x16_t sign_bit = vdupq_n_u8(0x80);
  const int16x8_t three_s16 = vdupq_n_s16(3);
  const int8x16_t three_s8 = vdupq_n_s8(3);
  const int8x16_t four_s8 = vdupq_n_s8(4);

  /* 2 * p rather than p << 1: shifting a negative int is undefined. */
  const uint8x16_t p1_u8 = vld1q_u8(s - 2 * p);
  const uint8x16_t p0_u8 = vld1q_u8(s - p);
  const uint8x16_t q0_u8 = vld1q_u8(s);
  const uint8x16_t q1_u8 = vld1q_u8(s + p);

  /*
   * Per-column mask: all ones where |p0 - q0| * 2 + |p1 - q1| / 2 <= limit
   * (both additions saturate at 255), all zeros elsewhere.
   */
  const uint8x16_t inner_abs = vabdq_u8(p0_u8, q0_u8);
  const uint8x16_t inner_x2 = vqaddq_u8(inner_abs, inner_abs);
  const uint8x16_t outer_half = vshrq_n_u8(vabdq_u8(p1_u8, q1_u8), 1);
  const uint8x16_t mask =
      vcgeq_u8(limit, vqaddq_u8(inner_x2, outer_half));

  /* Flip the top bit to map unsigned [0, 255] onto signed [-128, 127]. */
  const int8x16_t p1 = vreinterpretq_s8_u8(veorq_u8(p1_u8, sign_bit));
  const int8x16_t p0 = vreinterpretq_s8_u8(veorq_u8(p0_u8, sign_bit));
  const int8x16_t q0 = vreinterpretq_s8_u8(veorq_u8(q0_u8, sign_bit));
  const int8x16_t q1 = vreinterpretq_s8_u8(veorq_u8(q1_u8, sign_bit));

  /*
   * filter = clamp8(clamp8(p1 - q1) + 3 * (q0 - p0)). The 3 * (q0 - p0) term
   * is evaluated in 16 bits so that it cannot overflow before the final
   * saturating narrow.
   */
  const int8x16_t outer_diff = vqsubq_s8(p1, q1);
  const int16x8_t inner_lo = vsubl_s8(vget_low_s8(q0), vget_low_s8(p0));
  const int16x8_t inner_hi = vsubl_s8(vget_high_s8(q0), vget_high_s8(p0));
  const int16x8_t sum_lo =
      vaddw_s8(vmulq_s16(inner_lo, three_s16), vget_low_s8(outer_diff));
  const int16x8_t sum_hi =
      vaddw_s8(vmulq_s16(inner_hi, three_s16), vget_high_s8(outer_diff));
  const int8x16_t filter_all =
      vcombine_s8(vqmovn_s16(sum_lo), vqmovn_s16(sum_hi));

  /* Zero the filter value in columns that failed the limit test. */
  const int8x16_t filter = vandq_s8(filter_all, vreinterpretq_s8_u8(mask));

  /* p0 moves by (filter + 3) >> 3 and q0 by (filter + 4) >> 3, saturating. */
  const int8x16_t p0_adjust = vshrq_n_s8(vqaddq_s8(filter, three_s8), 3);
  const int8x16_t q0_adjust = vshrq_n_s8(vqaddq_s8(filter, four_s8), 3);
  const int8x16_t p0_out = vqaddq_s8(p0, p0_adjust);
  const int8x16_t q0_out = vqsubq_s8(q0, q0_adjust);

  /* Convert back to unsigned and store the two rows next to the edge. */
  vst1q_u8(s, veorq_u8(vreinterpretq_u8_s8(q0_out), sign_bit));
  vst1q_u8(s - p, veorq_u8(vreinterpretq_u8_s8(p0_out), sign_bit));
}
/*
 * Runs the simple horizontal-edge filter on the three edges located 4, 8 and
 * 12 rows below y_ptr. The edge at y_ptr itself is not touched.
 *
 *   y_ptr    - start of the region to filter.
 *   y_stride - distance in bytes between vertically adjacent rows.
 *   blimit   - pointer to the edge limit, passed through unchanged to each
 *              edge filter call.
 */
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  int edge;

  for (edge = 0; edge < 3; ++edge) {
    y_ptr += y_stride * 4;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
  }
}
/*
 * Runs the simple horizontal-edge filter once, on the edge directly above the
 * row addressed by y_ptr.
 *
 *   y_ptr    - first byte of the row just below the edge to filter.
 *   y_stride - distance in bytes between vertically adjacent rows.
 *   blimit   - pointer to the edge limit, passed through unchanged.
 */
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
}