/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include <arm_neon.h>
12ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "./vp9_rtcd.h"
13ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "./vpx_config.h"
14ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "vpx/vpx_integer.h"
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
/*
 * Writes the residual diff = src - pred for a rows x cols block, widening
 * every 8-bit sample pair to one signed 16-bit difference.
 *
 * The work is dispatched on cols:
 *   cols <= 4  : scalar loop over exactly cols samples per row
 *   cols <= 8  : one 8-sample vector per row
 *   cols <= 16 : one 16-sample vector per row
 *   otherwise  : 32-sample steps across each row
 * The vector paths always process a whole step, so per row they read and
 * write a full 8, 16 or multiple-of-32 samples whatever the exact value of
 * cols is.
 *
 * diff_stride, src_stride and pred_stride are the element distances between
 * the starts of consecutive rows in the respective buffers.
 */
void vp9_subtract_block_neon(int rows, int cols,
                             int16_t *diff, ptrdiff_t diff_stride,
                             const uint8_t *src, ptrdiff_t src_stride,
                             const uint8_t *pred, ptrdiff_t pred_stride) {
  int row;

  if (cols <= 4) {
    /* Narrow block: plain scalar subtraction. */
    for (row = 0; row < rows; ++row) {
      int x;
      for (x = 0; x < cols; ++x) {
        diff[x] = src[x] - pred[x];
      }
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  if (cols <= 8) {
    /* One 8-lane widening subtract covers the row. */
    for (row = 0; row < rows; ++row) {
      const uint8x8_t s = vld1_u8(src);
      const uint8x8_t p = vld1_u8(pred);
      /* The unsigned 16-bit result wraps; reinterpreting it as signed */
      /* yields the true difference in [-255, 255]. */
      const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(s, p));
      vst1q_s16(diff, d);
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  if (cols <= 16) {
    /* One 16-byte load per input, split into low and high halves. */
    for (row = 0; row < rows; ++row) {
      const uint8x16_t s = vld1q_u8(src);
      const uint8x16_t p = vld1q_u8(pred);
      const int16x8_t d_lo =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(p)));
      const int16x8_t d_hi =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(p)));
      vst1q_s16(diff, d_lo);
      vst1q_s16(diff + 8, d_hi);
      diff += diff_stride;
      pred += pred_stride;
      src += src_stride;
    }
    return;
  }

  /* Wide block: walk each row 32 samples at a time. */
  for (row = 0; row < rows; ++row) {
    int x;
    for (x = 0; x < cols; x += 32) {
      /* All four loads are issued before any store, as two 16-byte halves. */
      const uint8x16_t s_a = vld1q_u8(src + x);
      const uint8x16_t s_b = vld1q_u8(src + x + 16);
      const uint8x16_t p_a = vld1q_u8(pred + x);
      const uint8x16_t p_b = vld1q_u8(pred + x + 16);
      const int16x8_t d_a_lo =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s_a), vget_low_u8(p_a)));
      const int16x8_t d_a_hi =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s_a),
                                         vget_high_u8(p_a)));
      const int16x8_t d_b_lo =
          vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s_b), vget_low_u8(p_b)));
      const int16x8_t d_b_hi =
          vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s_b),
                                         vget_high_u8(p_b)));
      vst1q_s16(diff + x, d_a_lo);
      vst1q_s16(diff + x + 8, d_a_hi);
      vst1q_s16(diff + x + 16, d_b_lo);
      vst1q_s16(diff + x + 24, d_b_hi);
    }
    diff += diff_stride;
    pred += pred_stride;
    src += src_stride;
  }
}
82