sad_neon.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
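
/* Sum of absolute differences (SAD) for an 8-wide, 16-row block. The first
 * row initializes the eight 16-bit accumulator lanes with vabdl_u8; the
 * remaining 15 rows are accumulated with vabal_u8, and the lanes are then
 * reduced to a single 32-bit sum.
 */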
unsigned int vpx_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}
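
/* SAD for a 4x4 block. Each vld1_u8 reads 8 bytes per row, but only the low
 * four 16-bit lanes of the accumulator (the first 4 columns) are kept by the
 * final vget_low_u16 reduction, so the extra pixels do not affect the result.
 */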
unsigned int vpx_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
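
/* SAD for a 16-wide, 8-row block. Each 16-byte row is split into low and
 * high halves, accumulated separately in q12 and q13, then combined and
 * reduced to a single 32-bit sum.
 */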
unsigned int vpx_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}
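
/* Reduce two 16-bit accumulators to a single 32-bit sum. The halves are
 * widened to 32 bits before being added, so the combined total may exceed
 * 16 bits (as it can in the 64x64 case) without overflowing.
 */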
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
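
/* Reduce a single 16-bit accumulator to a 32-bit sum; the per-lane totals
 * must already fit in 16 bits.
 */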
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
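
/* SAD for a 64x64 block: four 16-byte loads per row, with the low and high
 * halves of each chunk accumulated into vec_accum_lo/vec_accum_hi. The
 * widening reduction is needed here because the two accumulators can
 * together exceed 16 bits.
 */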
unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
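
/* SAD for a 32x32 block: two 16-byte loads per row. The accumulator totals
 * stay within 16 bits, so the plain horizontal_add_16x8 reduction suffices.
 */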
unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
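
/* SAD for a 16x16 block: one 16-byte load per row, split into low and high
 * halves for accumulation.
 */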
unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
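
/* SAD for an 8x8 block: a single 16-bit accumulator is sufficient. */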
unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}