1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12#include "vp8/encoder/block.h"
13
/*
 * Writes a 4x4 block of 16-bit differences (source minus predictor) to
 * be->src_diff.
 *
 *   be     supplies the source pixels (*be->base_src + be->src, rows
 *          be->src_stride bytes apart) and the output buffer be->src_diff.
 *   bd     supplies the predictor pixels via bd->predictor.
 *   pitch  row step, in elements, for both the predictor (bytes) and the
 *          difference output (16-bit values).
 *
 * Every row load fetches 8 bytes from the source and from the predictor,
 * but only the four low differences of each row are stored. The subtraction
 * is done on widened unsigned lanes, so a negative difference wraps to the
 * bit pattern of the corresponding signed 16-bit value.
 */
void vp8_subtract_b_neon(
        BLOCK *be,
        BLOCKD *bd,
        int pitch) {
    const unsigned char *src_base = *be->base_src + be->src;
    const unsigned char *pred_base = bd->predictor;
    const int src_stride = be->src_stride;
    int16_t *diff_base;
    uint16x8_t row_diff[4];
    int r;

    /* Read and subtract all four rows before anything is written out. */
    for (r = 0; r < 4; r++) {
        const uint8x8_t src_px = vld1_u8(src_base + r * src_stride);
        const uint8x8_t pred_px = vld1_u8(pred_base + r * pitch);

        row_diff[r] = vsubl_u8(src_px, pred_px);
    }

    /* Only the low four lanes of each widened row belong to this block. */
    diff_base = be->src_diff;
    for (r = 0; r < 4; r++) {
        vst1_u16((uint16_t *)(diff_base + r * pitch),
                 vget_low_u16(row_diff[r]));
    }
}
59
/*
 * Computes a 16x16 block of 16-bit differences, diff = src - pred.
 *
 *   diff         output; 256 values are written contiguously, 16 per row.
 *   src          first source row; rows are src_stride bytes apart.
 *   pred         first predictor row; rows are pred_stride bytes apart.
 *
 * The subtraction is done on widened unsigned lanes, so a negative
 * difference wraps to the bit pattern of the matching signed 16-bit value.
 */
void vp8_subtract_mby_neon(
        int16_t *diff,
        unsigned char *src,
        int src_stride,
        unsigned char *pred,
        int pred_stride) {
    int row;

    /* Two rows per pass; both rows are fully loaded before any store. */
    for (row = 0; row < 16; row += 2) {
        const unsigned char *src_odd = src + src_stride;
        const unsigned char *pred_odd = pred + pred_stride;
        const uint8x16_t s_even = vld1q_u8(src);
        const uint8x16_t s_odd = vld1q_u8(src_odd);
        const uint8x16_t p_even = vld1q_u8(pred);
        const uint8x16_t p_odd = vld1q_u8(pred_odd);
        uint16_t *out = (uint16_t *)diff;

        /* Each 16-pixel row widens into a low and a high 8-lane half. */
        vst1q_u16(out,
                  vsubl_u8(vget_low_u8(s_even), vget_low_u8(p_even)));
        vst1q_u16(out + 8,
                  vsubl_u8(vget_high_u8(s_even), vget_high_u8(p_even)));
        vst1q_u16(out + 16,
                  vsubl_u8(vget_low_u8(s_odd), vget_low_u8(p_odd)));
        vst1q_u16(out + 24,
                  vsubl_u8(vget_high_u8(s_odd), vget_high_u8(p_odd)));

        src = src_odd + src_stride;
        pred = pred_odd + pred_stride;
        diff += 32;
    }
}
96
/*
 * Computes two 8x8 blocks of 16-bit differences (source minus predictor),
 * first for the U plane and then for the V plane.
 *
 *   diff         output base; writing starts at diff + 256. The 64 U
 *                differences come first, followed directly by the 64 V
 *                differences, 8 values per row.
 *   usrc, vsrc   first source row of each plane; rows are src_stride bytes
 *                apart.
 *   upred, vpred first predictor row of each plane; rows are pred_stride
 *                bytes apart.
 *
 * The subtraction is done on widened unsigned lanes, so a negative
 * difference wraps to the bit pattern of the matching signed 16-bit value.
 */
void vp8_subtract_mbuv_neon(
        int16_t *diff,
        unsigned char *usrc,
        unsigned char *vsrc,
        int src_stride,
        unsigned char *upred,
        unsigned char *vpred,
        int pred_stride) {
    int plane, group, r;

    /* Skip the first 256 entries of the output buffer. */
    diff += 256;

    for (plane = 0; plane < 2; plane++) {
        /* Plane 0 is U, plane 1 is V. */
        const unsigned char *src_row = (plane == 0) ? usrc : vsrc;
        const unsigned char *pred_row = (plane == 0) ? upred : vpred;

        /* An 8x8 plane is handled as two groups of four rows. */
        for (group = 0; group < 2; group++) {
            uint16x8_t row_diff[4];

            /* Load and subtract the whole group before storing it. */
            for (r = 0; r < 4; r++) {
                const uint8x8_t src_px = vld1_u8(src_row);
                const uint8x8_t pred_px = vld1_u8(pred_row);

                row_diff[r] = vsubl_u8(src_px, pred_px);
                src_row += src_stride;
                pred_row += pred_stride;
            }

            for (r = 0; r < 4; r++) {
                vst1q_u16((uint16_t *)diff, row_diff[r]);
                diff += 8;
            }
        }
    }
}
155