1/*
2 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <arm_neon.h>
12
13#include "./vpx_config.h"
14
15#include "vpx/vpx_integer.h"
16#include "vpx_dsp/arm/mem_neon.h"
17#include "vpx_dsp/arm/sum_neon.h"
18
uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  /* Gather the whole 4x4 source and reference blocks into one q register
   * each (four strided 4-byte rows packed together). */
  const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
  /* Widening absolute difference of the low halves, then accumulate the
   * high halves on top. */
  uint16x8_t sum = vabdl_u8(vget_low_u8(s), vget_low_u8(r));
  sum = vabal_u8(sum, vget_high_u8(s), vget_high_u8(r));
  return vget_lane_u32(horizontal_add_uint16x8(sum), 0);
}
27
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  /* The whole 4x4 block fits in one q register per operand; second_pred is
   * stored contiguously (16 bytes). */
  const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
  const uint8x16_t pred = vld1q_u8(second_pred);
  /* Compare src against the rounding average of ref and the second
   * predictor. */
  const uint8x16_t ref_avg = vrhaddq_u8(r, pred);
  uint16x8_t sum = vabdl_u8(vget_low_u8(s), vget_low_u8(ref_avg));
  sum = vabal_u8(sum, vget_high_u8(s), vget_high_u8(ref_avg));
  return vget_lane_u32(horizontal_add_uint16x8(sum), 0);
}
39
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  uint16x8_t sum = vdupq_n_u16(0);
  int rows_left;

  /* Process the 4x8 block as two 4x4 tiles, each tile packed into a single
   * q register per operand. */
  for (rows_left = 8; rows_left > 0; rows_left -= 4) {
    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
    sum = vabal_u8(sum, vget_low_u8(s), vget_low_u8(r));
    sum = vabal_u8(sum, vget_high_u8(s), vget_high_u8(r));
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
  }

  return vget_lane_u32(horizontal_add_uint16x8(sum), 0);
}
55
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  uint16x8_t sum = vdupq_n_u16(0);
  int rows_left;

  /* Two 4x4 tiles; second_pred is contiguous at 4 bytes per row, so each
   * tile consumes 16 bytes of it. */
  for (rows_left = 8; rows_left > 0; rows_left -= 4) {
    const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
    const uint8x16_t pred = vld1q_u8(second_pred);
    /* SAD is taken against the rounding average of ref and second_pred. */
    const uint8x16_t ref_avg = vrhaddq_u8(r, pred);
    sum = vabal_u8(sum, vget_low_u8(s), vget_low_u8(ref_avg));
    sum = vabal_u8(sum, vget_high_u8(s), vget_high_u8(ref_avg));
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    second_pred += 16;
  }

  return vget_lane_u32(horizontal_add_uint16x8(sum), 0);
}
75
/* Accumulate per-lane absolute differences for an 8-wide block of the given
 * height. Caller reduces the returned uint16x8_t to a scalar. */
static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride, const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  /* One 8-pixel row per iteration; vabal_u8 widens and accumulates. */
  for (row = height; row > 0; --row) {
    const uint8x8_t a_row = vld1_u8(a);
    const uint8x8_t b_row = vld1_u8(b);
    sum = vabal_u8(sum, a_row, b_row);
    a += a_stride;
    b += b_stride;
  }
  return sum;
}
90
/* As sad8x(), but compares `a` against the rounding average of `b` and the
 * second predictor `c` (stored contiguously, 8 bytes per row). */
static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   const uint8_t *c, const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = height; row > 0; --row) {
    const uint8x8_t a_row = vld1_u8(a);
    const uint8x8_t b_row = vld1_u8(b);
    const uint8x8_t c_row = vld1_u8(c);
    /* Round-average the reference with the second predictor first. */
    const uint8x8_t b_avg = vrhadd_u8(b_row, c_row);
    sum = vabal_u8(sum, a_row, b_avg);
    a += a_stride;
    b += b_stride;
    c += 8; /* second-predictor rows are packed back to back */
  }
  return sum;
}
109
/* Define vpx_sad8x<n>_neon() and vpx_sad8x<n>_avg_neon() for 8-wide blocks
 * of height n. The _avg variant compares src against the rounding average of
 * ref and second_pred, where second_pred is an 8x<n> block stored
 * contiguously (8 bytes per row). */
#define sad8xN(n)                                                      \
  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,     \
                               const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
  }                                                                    \
                                                                       \
  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                   const uint8_t *ref, int ref_stride, \
                                   const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                             \
        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
  }

/* Instantiate the 8x4, 8x8 and 8x16 block sizes. */
sad8xN(4);
sad8xN(8);
sad8xN(16);
128
/* Accumulate per-lane absolute differences for a 16-wide block of the given
 * height. Caller reduces the returned uint16x8_t to a scalar. */
static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = height; row > 0; --row) {
    const uint8x16_t a_row = vld1q_u8(a);
    const uint8x16_t b_row = vld1q_u8(b);
    /* Fold both halves of the 16-byte row into the same 8 lanes. */
    sum = vabal_u8(sum, vget_low_u8(a_row), vget_low_u8(b_row));
    sum = vabal_u8(sum, vget_high_u8(a_row), vget_high_u8(b_row));
    a += a_stride;
    b += b_stride;
  }
  return sum;
}
145
/* As sad16x(), but compares `a` against the rounding average of `b` and the
 * second predictor `c` (stored contiguously, 16 bytes per row). */
static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = height; row > 0; --row) {
    const uint8x16_t a_row = vld1q_u8(a);
    const uint8x16_t b_row = vld1q_u8(b);
    const uint8x16_t c_row = vld1q_u8(c);
    const uint8x16_t b_avg = vrhaddq_u8(b_row, c_row);
    sum = vabal_u8(sum, vget_low_u8(a_row), vget_low_u8(b_avg));
    sum = vabal_u8(sum, vget_high_u8(a_row), vget_high_u8(b_avg));
    a += a_stride;
    b += b_stride;
    c += 16; /* second-predictor rows are packed back to back */
  }
  return sum;
}
165
/* Define vpx_sad16x<n>_neon() and vpx_sad16x<n>_avg_neon() for 16-wide
 * blocks of height n. second_pred is a 16x<n> block stored contiguously
 * (16 bytes per row). */
#define sad16xN(n)                                                      \
  uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                              \
        sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }

/* Instantiate the 16x8, 16x16 and 16x32 block sizes. */
sad16xN(8);
sad16xN(16);
sad16xN(32);
184
/* Accumulate per-lane absolute differences for a 32-wide block of the given
 * height. Caller reduces the returned uint16x8_t to a scalar. */
static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = height; row > 0; --row) {
    /* Each 32-byte row is loaded as two q registers per operand. */
    const uint8x16_t a_left = vld1q_u8(a);
    const uint8x16_t a_right = vld1q_u8(a + 16);
    const uint8x16_t b_left = vld1q_u8(b);
    const uint8x16_t b_right = vld1q_u8(b + 16);
    sum = vabal_u8(sum, vget_low_u8(a_left), vget_low_u8(b_left));
    sum = vabal_u8(sum, vget_high_u8(a_left), vget_high_u8(b_left));
    sum = vabal_u8(sum, vget_low_u8(a_right), vget_low_u8(b_right));
    sum = vabal_u8(sum, vget_high_u8(a_right), vget_high_u8(b_right));
    a += a_stride;
    b += b_stride;
  }
  return sum;
}
205
/* As sad32x(), but compares `a` against the rounding average of `b` and the
 * second predictor `c` (stored contiguously, 32 bytes per row). */
static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  uint16x8_t sum = vdupq_n_u16(0);
  int row;

  for (row = height; row > 0; --row) {
    const uint8x16_t a_left = vld1q_u8(a);
    const uint8x16_t a_right = vld1q_u8(a + 16);
    const uint8x16_t b_left = vld1q_u8(b);
    const uint8x16_t b_right = vld1q_u8(b + 16);
    const uint8x16_t c_left = vld1q_u8(c);
    const uint8x16_t c_right = vld1q_u8(c + 16);
    const uint8x16_t avg_left = vrhaddq_u8(b_left, c_left);
    const uint8x16_t avg_right = vrhaddq_u8(b_right, c_right);
    sum = vabal_u8(sum, vget_low_u8(a_left), vget_low_u8(avg_left));
    sum = vabal_u8(sum, vget_high_u8(a_left), vget_high_u8(avg_left));
    sum = vabal_u8(sum, vget_low_u8(a_right), vget_low_u8(avg_right));
    sum = vabal_u8(sum, vget_high_u8(a_right), vget_high_u8(avg_right));
    a += a_stride;
    b += b_stride;
    c += 32; /* second-predictor rows are packed back to back */
  }
  return sum;
}
231
/* Define vpx_sad32x<n>_neon() and vpx_sad32x<n>_avg_neon() for 32-wide
 * blocks of height n. second_pred is a 32x<n> block stored contiguously
 * (32 bytes per row). */
#define sad32xN(n)                                                      \
  uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                              \
        sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }

/* Instantiate the 32x16, 32x32 and 32x64 block sizes. */
sad32xN(16);
sad32xN(32);
sad32xN(64);
250
/* Accumulate absolute differences for a 64-wide block of the given height,
 * widened to 32 bits. Caller reduces the returned uint32x4_t to a scalar. */
static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  /* The left and right 32 bytes of each row feed separate 16-bit
   * accumulators; the split keeps every lane within uint16_t range for
   * heights up to 64 (4 bytes/lane/row * 255 * 64 = 65280). */
  uint16x8_t sum_left = vdupq_n_u16(0);
  uint16x8_t sum_right = vdupq_n_u16(0);
  uint32x4_t total;
  int row;

  for (row = height; row > 0; --row) {
    const uint8x16_t a_0 = vld1q_u8(a);
    const uint8x16_t a_1 = vld1q_u8(a + 16);
    const uint8x16_t a_2 = vld1q_u8(a + 32);
    const uint8x16_t a_3 = vld1q_u8(a + 48);
    const uint8x16_t b_0 = vld1q_u8(b);
    const uint8x16_t b_1 = vld1q_u8(b + 16);
    const uint8x16_t b_2 = vld1q_u8(b + 32);
    const uint8x16_t b_3 = vld1q_u8(b + 48);
    sum_left = vabal_u8(sum_left, vget_low_u8(a_0), vget_low_u8(b_0));
    sum_left = vabal_u8(sum_left, vget_high_u8(a_0), vget_high_u8(b_0));
    sum_left = vabal_u8(sum_left, vget_low_u8(a_1), vget_low_u8(b_1));
    sum_left = vabal_u8(sum_left, vget_high_u8(a_1), vget_high_u8(b_1));
    sum_right = vabal_u8(sum_right, vget_low_u8(a_2), vget_low_u8(b_2));
    sum_right = vabal_u8(sum_right, vget_high_u8(a_2), vget_high_u8(b_2));
    sum_right = vabal_u8(sum_right, vget_low_u8(a_3), vget_low_u8(b_3));
    sum_right = vabal_u8(sum_right, vget_high_u8(a_3), vget_high_u8(b_3));
    a += a_stride;
    b += b_stride;
  }

  /* Pairwise-widen both accumulators into a single 32-bit vector. */
  total = vpaddlq_u16(sum_left);
  return vpadalq_u16(total, sum_right);
}
284
/* As sad64x(), but compares `a` against the rounding average of `b` and the
 * second predictor `c` (stored contiguously, 64 bytes per row). */
static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  /* Two 16-bit accumulators keep every lane within uint16_t range for
   * heights up to 64; see sad64x(). */
  uint16x8_t sum_left = vdupq_n_u16(0);
  uint16x8_t sum_right = vdupq_n_u16(0);
  uint32x4_t total;
  int row;

  for (row = height; row > 0; --row) {
    const uint8x16_t a_0 = vld1q_u8(a);
    const uint8x16_t a_1 = vld1q_u8(a + 16);
    const uint8x16_t a_2 = vld1q_u8(a + 32);
    const uint8x16_t a_3 = vld1q_u8(a + 48);
    const uint8x16_t b_0 = vld1q_u8(b);
    const uint8x16_t b_1 = vld1q_u8(b + 16);
    const uint8x16_t b_2 = vld1q_u8(b + 32);
    const uint8x16_t b_3 = vld1q_u8(b + 48);
    const uint8x16_t c_0 = vld1q_u8(c);
    const uint8x16_t c_1 = vld1q_u8(c + 16);
    const uint8x16_t c_2 = vld1q_u8(c + 32);
    const uint8x16_t c_3 = vld1q_u8(c + 48);
    const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
    const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
    const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
    const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
    sum_left = vabal_u8(sum_left, vget_low_u8(a_0), vget_low_u8(avg_0));
    sum_left = vabal_u8(sum_left, vget_high_u8(a_0), vget_high_u8(avg_0));
    sum_left = vabal_u8(sum_left, vget_low_u8(a_1), vget_low_u8(avg_1));
    sum_left = vabal_u8(sum_left, vget_high_u8(a_1), vget_high_u8(avg_1));
    sum_right = vabal_u8(sum_right, vget_low_u8(a_2), vget_low_u8(avg_2));
    sum_right = vabal_u8(sum_right, vget_high_u8(a_2), vget_high_u8(avg_2));
    sum_right = vabal_u8(sum_right, vget_low_u8(a_3), vget_low_u8(avg_3));
    sum_right = vabal_u8(sum_right, vget_high_u8(a_3), vget_high_u8(avg_3));
    a += a_stride;
    b += b_stride;
    c += 64; /* second-predictor rows are packed back to back */
  }

  /* Pairwise-widen both accumulators into a single 32-bit vector. */
  total = vpaddlq_u16(sum_left);
  return vpadalq_u16(total, sum_right);
}
327
/* Define vpx_sad64x<n>_neon() and vpx_sad64x<n>_avg_neon() for 64-wide
 * blocks of height n. second_pred is a 64x<n> block stored contiguously
 * (64 bytes per row). These use the 32-bit reduction path since the totals
 * can exceed what a single 16-bit accumulator could hold. */
#define sad64xN(n)                                                      \
  uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint32x4_t abs =                                              \
        sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
  }

/* Instantiate the 64x32 and 64x64 block sizes. */
sad64xN(32);
sad64xN(64);
345