/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"

// Reduce all eight signed 16-bit lanes to a single scalar sum.
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
  const int32x4_t a = vpaddlq_s16(v_16x8);
  const int64x2_t b = vpaddlq_s32(a);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

// Reduce all four signed 32-bit lanes to a single scalar sum.
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
  const int64x2_t b = vpaddlq_s32(v_32x4);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}

// Accumulate the sum of differences and the sum of squared differences for a
// w x h block, eight pixels at a time.
// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, uint32_t *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}

void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride,
                        unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
}

void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          unsigned int *sse, int *sum) {
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}

unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride,
                                  unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
}

unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
}

unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
}
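
// Illustrative reference (an editorial sketch, not part of libvpx): the
// wrappers above all evaluate the identity
//   variance = sse - sum * sum / (w * h),
// where sse and sum come from variance_neon_w8; the shifts are exact because
// w * h is always a power of two here. The helper name variance_ref_c is
// hypothetical and is included only for cross-checking.
static INLINE unsigned int variance_ref_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  int64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (sum * sum) / (w * h));
}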

// The larger blocks are split into stripes so that each call to
// variance_neon_w8 stays under its w * h < 2048 overflow limit; the partial
// sse/sum pairs are combined before applying the variance identity.
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
  variance_neon_w8(a + (32 * a_stride), a_stride,
                   b + (32 * b_stride), b_stride, 32, 32,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
}

unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;
  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
}

unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    unsigned int *sse) {
  int sum1, sum2;
  uint32_t sse1, sse2;

  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
  variance_neon_w8(a + (16 * a_stride), a_stride,
                   b + (16 * b_stride), b_stride, 64, 16,
                   &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
                   b + (16 * 2 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  sse1 += sse2;
  sum1 += sum2;

  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
                   b + (16 * 3 * b_stride), b_stride,
                   64, 16, &sse2, &sum2);
  *sse = sse1 + sse2;
  sum1 += sum2;
  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
}
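
// Illustrative sketch (editorial, not part of libvpx): the three wide-block
// variants above all follow the same pattern, namely accumulate per-stripe
// sse/sum pairs from variance_neon_w8, keeping each stripe under the
// w * h < 2048 limit, and apply the variance identity once at the end. The
// helper name variance_neon_w8_stripes is hypothetical.
static INLINE unsigned int variance_neon_w8_stripes(
    const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
    int w, int h, int stripe_h, unsigned int *sse) {
  uint32_t sse_total = 0;
  int sum_total = 0;
  int i;
  for (i = 0; i < h; i += stripe_h) {
    uint32_t sse_part;
    int sum_part;
    variance_neon_w8(a + i * a_stride, a_stride, b + i * b_stride, b_stride,
                     w, stripe_h, &sse_part, &sum_part);
    sse_total += sse_part;
    sum_total += sum_part;
  }
  *sse = sse_total;
  return sse_total -
         (unsigned int)(((int64_t)sum_total * sum_total) / (w * h));
}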

unsigned int vpx_variance16x8_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 4; i++) {
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}

unsigned int vpx_variance8x16_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  uint8x8_t d0u8, d2u8, d4u8, d6u8;
  int16x4_t d22s16, d23s16, d24s16, d25s16;
  uint32x2_t d0u32, d10u32;
  int64x1_t d0s64, d1s64;
  uint16x8_t q11u16, q12u16;
  int32x4_t q8s32, q9s32, q10s32;
  int64x2_t q0s64, q1s64, q5s64;

  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {
    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    __builtin_prefetch(src_ptr);

    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    __builtin_prefetch(ref_ptr);

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d2u8, d6u8);

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
  }

  q10s32 = vaddq_s32(q10s32, q9s32);
  q0s64 = vpaddlq_s32(q8s32);
  q1s64 = vpaddlq_s32(q10s32);

  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                    vreinterpret_s32_s64(d0s64));
  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

  return vget_lane_u32(d0u32, 0);
}
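
// Note on the two hand-ported kernels above: q8s32 accumulates the signed
// pixel differences and q9s32/q10s32 accumulate the squared differences.
// The scalar tail folds each accumulator down, squares the summed difference
// with vmull_s32, and vshr_n_u32(..., 7) divides by 128 (= 16 * 8 = 8 * 16),
// so the returned value is again sse - sum * sum / (w * h).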

unsigned int vpx_mse16x16_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse) {
  int i;
  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
  int64x1_t d0s64;
  uint8x16_t q0u8, q1u8, q2u8, q3u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  q7s32 = vdupq_n_s32(0);
  q8s32 = vdupq_n_s32(0);
  q9s32 = vdupq_n_s32(0);
  q10s32 = vdupq_n_s32(0);

  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
    q0u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q1u8 = vld1q_u8(src_ptr);
    src_ptr += source_stride;
    q2u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;
    q3u8 = vld1q_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));

    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);

    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);

    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
  }

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q10s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q10s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}

unsigned int vpx_get4x4sse_cs_neon(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride) {
  int16x4_t d22s16, d24s16, d26s16, d28s16;
  int64x1_t d0s64;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
  int32x4_t q7s32, q8s32, q9s32, q10s32;
  uint16x8_t q11u16, q12u16, q13u16, q14u16;
  int64x2_t q1s64;

  d0u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d4u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d1u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d5u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d2u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d6u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;
  d3u8 = vld1_u8(src_ptr);
  src_ptr += source_stride;
  d7u8 = vld1_u8(ref_ptr);
  ref_ptr += recon_stride;

  q11u16 = vsubl_u8(d0u8, d4u8);
  q12u16 = vsubl_u8(d1u8, d5u8);
  q13u16 = vsubl_u8(d2u8, d6u8);
  q14u16 = vsubl_u8(d3u8, d7u8);

  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

  q7s32 = vmull_s16(d22s16, d22s16);
  q8s32 = vmull_s16(d24s16, d24s16);
  q9s32 = vmull_s16(d26s16, d26s16);
  q10s32 = vmull_s16(d28s16, d28s16);

  q7s32 = vaddq_s32(q7s32, q8s32);
  q9s32 = vaddq_s32(q9s32, q10s32);
  q9s32 = vaddq_s32(q7s32, q9s32);

  q1s64 = vpaddlq_s32(q9s32);
  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
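
// Usage sketch (editorial illustration, not part of the libvpx build): feeds
// a random 8x8 block pair through the NEON kernel and re-derives the variance
// from the sse/sum pair returned by vpx_get8x8var_neon. The guard macro
// VPX_VARIANCE_NEON_SELFTEST is hypothetical; define it manually to compile
// this block as a standalone check on a NEON-capable target.
#ifdef VPX_VARIANCE_NEON_SELFTEST
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  uint8_t src[8 * 8], ref[8 * 8];
  unsigned int sse, var;
  int sum, i;
  for (i = 0; i < 8 * 8; ++i) {
    src[i] = (uint8_t)(rand() & 0xff);
    ref[i] = (uint8_t)(rand() & 0xff);
  }
  var = vpx_variance8x8_neon(src, 8, ref, 8, &sse);
  vpx_get8x8var_neon(src, 8, ref, 8, &sse, &sum);
  printf("var=%u sse=%u recomputed=%u\n", var, sse,
         sse - (unsigned int)(((int64_t)sum * sum) >> 6));
  return 0;
}
#endif  // VPX_VARIANCE_NEON_SELFTEST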