/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

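/*
 * ARM NEON (intrinsics) versions of the VP8 sum-of-absolute-differences
 * (SAD) functions used by the encoder's motion search.  Each routine
 * returns sum(|src[r][c] - ref[r][c]|) over one WxH block.
 */
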
#include <arm_neon.h>

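/*
 * SAD of an 8x8 block: |src - ref| is accumulated into eight 16-bit lanes
 * (vabdl/vabal), then the lanes are folded down with pairwise add-long
 * steps.  Each lane sums at most 8 * 255, so 16 bits per lane is plenty
 * and no widening is needed inside the row loop.
 */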
unsigned int vp8_sad8x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widening absolute difference starts the accumulator. */
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    /* Remaining 7 rows: absolute difference and accumulate. */
    for (i = 0; i < 7; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Horizontal reduction: 8 x u16 -> 4 x u32 -> 2 x u64, then add the
     * low 32 bits of the two halves (the total cannot overflow 32 bits). */
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

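/* SAD of an 8x16 block: same scheme as vp8_sad8x8_neon, over 16 rows. */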
unsigned int vp8_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

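/*
 * SAD of a 4x4 block.  vld1_u8 loads 8 bytes per row, so each row load
 * reads 4 bytes past the block; only the low four 16-bit lanes of the
 * accumulator (the first 4 pixels) are folded into the result.
 */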
unsigned int vp8_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

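/*
 * SAD of a 16x16 block: two 16-bit accumulators, one per half of the
 * 16-byte row, merged before the final reduction.  Each lane sums at
 * most 16 * 255 = 4080, so 16 bits per lane is still sufficient.
 */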
unsigned int vp8_sad16x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 15; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

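/* SAD of a 16x8 block: same scheme as vp8_sad16x16_neon, over 8 rows. */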
unsigned int vp8_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

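/*
 * For reference, a scalar sketch of what every vp8_sadWxH_neon above
 * computes.  Illustrative only and compiled out; sad_ref_c is a
 * hypothetical name, not part of the library API.
 */
#if 0
static unsigned int sad_ref_c(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride,
                              int width, int height) {
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++) {
            const int diff = src_ptr[c] - ref_ptr[c];
            sad += (unsigned int)(diff < 0 ? -diff : diff);
        }
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    }
    return sad;
}
#endif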