/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"

// All four 4-byte rows of a 4x4 block fit in a single 16-byte vector, so
// one widening absolute difference plus one widening accumulate covers the
// whole block.
uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
  abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
}

// The _avg variants first form the rounding average of the reference and the
// second predictor (vrhaddq_u8 computes (b + c + 1) >> 1) and take the SAD
// against that average.
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
  const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
  const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
  const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
  uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
  abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
}

uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *ref_ptr, int ref_stride) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);
  // Each iteration loads four 4-byte rows into one 16-byte vector.
  for (i = 0; i < 8; i += 4) {
    const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8));
    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
  }

  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
}

uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             const uint8_t *second_pred) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);
  for (i = 0; i < 8; i += 4) {
    const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
    const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
    const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
    const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
    src_ptr += 4 * src_stride;
    ref_ptr += 4 * ref_stride;
    second_pred += 16;
    abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg));
    abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
  }

  return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
}
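/*
 * The wider block sizes below share one pattern: a static row-loop helper
 * accumulates per-lane absolute differences with vabal_u8, and a macro
 * stamps out the public vpx_sadWxH_neon / vpx_sadWxH_avg_neon entry points
 * for each supported height.
 */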
static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride, const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x8_t a_u8 = vld1_u8(a);
    const uint8x8_t b_u8 = vld1_u8(b);
    a += a_stride;
    b += b_stride;
    abs = vabal_u8(abs, a_u8, b_u8);
  }
  return abs;
}

static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   const uint8_t *c, const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x8_t a_u8 = vld1_u8(a);
    const uint8x8_t b_u8 = vld1_u8(b);
    const uint8x8_t c_u8 = vld1_u8(c);
    const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
    a += a_stride;
    b += b_stride;
    c += 8;  // The second predictor is packed contiguously, 8 bytes per row.
    abs = vabal_u8(abs, a_u8, avg);
  }
  return abs;
}

#define sad8xN(n)                                                      \
  uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride,     \
                               const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
  }                                                                    \
                                                                       \
  uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                   const uint8_t *ref, int ref_stride, \
                                   const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                             \
        sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);             \
  }

sad8xN(4);
sad8xN(8);
sad8xN(16);

static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_u8 = vld1q_u8(a);
    const uint8x16_t b_u8 = vld1q_u8(b);
    a += a_stride;
    b += b_stride;
    abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
    abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
  }
  return abs;
}

static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_u8 = vld1q_u8(a);
    const uint8x16_t b_u8 = vld1q_u8(b);
    const uint8x16_t c_u8 = vld1q_u8(c);
    const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
    a += a_stride;
    b += b_stride;
    c += 16;
    abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
    abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
  }
  return abs;
}

#define sad16xN(n)                                                      \
  uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                              \
        sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }

sad16xN(8);
sad16xN(16);
sad16xN(32);
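/*
 * The 16-bit accumulator lanes still have headroom at width 32: each of the
 * 8 lanes sums 4 bytes per row, so the worst case for sad32x64 is
 * 4 * 64 * 255 = 65,280, which just fits in a uint16_t.
 */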
static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_lo = vld1q_u8(a);
    const uint8x16_t a_hi = vld1q_u8(a + 16);
    const uint8x16_t b_lo = vld1q_u8(b);
    const uint8x16_t b_hi = vld1q_u8(b + 16);
    a += a_stride;
    b += b_stride;
    abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
    abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
    abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
    abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi));
  }
  return abs;
}

static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  int i;
  uint16x8_t abs = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_lo = vld1q_u8(a);
    const uint8x16_t a_hi = vld1q_u8(a + 16);
    const uint8x16_t b_lo = vld1q_u8(b);
    const uint8x16_t b_hi = vld1q_u8(b + 16);
    const uint8x16_t c_lo = vld1q_u8(c);
    const uint8x16_t c_hi = vld1q_u8(c + 16);
    const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
    const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
    a += a_stride;
    b += b_stride;
    c += 32;
    abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
    abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
    abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
    abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi));
  }
  return abs;
}

#define sad32xN(n)                                                      \
  uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint16x8_t abs =                                              \
        sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint16x8(abs), 0);              \
  }

sad32xN(16);
sad32xN(32);
sad32xN(64);
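/*
 * At width 64 a single uint16x8_t accumulator could overflow
 * (8 * 64 * 255 = 130,560 for sad64x64), so the work is split across two
 * 16-bit accumulators, each of which peaks at 65,280, and the final
 * reduction widens to 32 bits with vpaddlq_u16/vpadalq_u16.
 */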
static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                const int height) {
  int i;
  uint16x8_t abs_0 = vdupq_n_u16(0);
  uint16x8_t abs_1 = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_0 = vld1q_u8(a);
    const uint8x16_t a_1 = vld1q_u8(a + 16);
    const uint8x16_t a_2 = vld1q_u8(a + 32);
    const uint8x16_t a_3 = vld1q_u8(a + 48);
    const uint8x16_t b_0 = vld1q_u8(b);
    const uint8x16_t b_1 = vld1q_u8(b + 16);
    const uint8x16_t b_2 = vld1q_u8(b + 32);
    const uint8x16_t b_3 = vld1q_u8(b + 48);
    a += a_stride;
    b += b_stride;
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3));
  }

  {
    const uint32x4_t sum = vpaddlq_u16(abs_0);
    return vpadalq_u16(sum, abs_1);
  }
}

static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    const uint8_t *c, const int height) {
  int i;
  uint16x8_t abs_0 = vdupq_n_u16(0);
  uint16x8_t abs_1 = vdupq_n_u16(0);

  for (i = 0; i < height; ++i) {
    const uint8x16_t a_0 = vld1q_u8(a);
    const uint8x16_t a_1 = vld1q_u8(a + 16);
    const uint8x16_t a_2 = vld1q_u8(a + 32);
    const uint8x16_t a_3 = vld1q_u8(a + 48);
    const uint8x16_t b_0 = vld1q_u8(b);
    const uint8x16_t b_1 = vld1q_u8(b + 16);
    const uint8x16_t b_2 = vld1q_u8(b + 32);
    const uint8x16_t b_3 = vld1q_u8(b + 48);
    const uint8x16_t c_0 = vld1q_u8(c);
    const uint8x16_t c_1 = vld1q_u8(c + 16);
    const uint8x16_t c_2 = vld1q_u8(c + 32);
    const uint8x16_t c_3 = vld1q_u8(c + 48);
    const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
    const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
    const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
    const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
    a += a_stride;
    b += b_stride;
    c += 64;
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
    abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
    abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2));
    abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3));
    abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3));
  }

  {
    const uint32x4_t sum = vpaddlq_u16(abs_0);
    return vpadalq_u16(sum, abs_1);
  }
}

#define sad64xN(n)                                                      \
  uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride,     \
                                const uint8_t *ref, int ref_stride) {   \
    const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
  }                                                                     \
                                                                        \
  uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
                                    const uint8_t *ref, int ref_stride, \
                                    const uint8_t *second_pred) {       \
    const uint32x4_t abs =                                              \
        sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n);   \
    return vget_lane_u32(horizontal_add_uint32x4(abs), 0);              \
  }

sad64xN(32);
sad64xN(64);
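/*
 * For reference, a minimal scalar sketch of the quantity every function
 * above computes, kept compiled out since it is illustrative only and not
 * part of the vpx_dsp interface. The helper name sad_ref is hypothetical;
 * libvpx's real C reference implementations live in vpx_dsp/sad.c.
 */
#if 0
static uint32_t sad_ref(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride, int width,
                        int height) {
  uint32_t sad = 0;
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      const int diff = src[j] - ref[j];
      sad += (uint32_t)(diff < 0 ? -diff : diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
#endif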