/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_top = vcombine_u16(p1, p1);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    sum_left = vcombine_u16(p1, p1);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 3);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 2);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 2);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 4; ++i) {
      vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
    }
  }
}

void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_4x4(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_4x4(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_4x4(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_4x4(dst, stride, NULL, NULL, 0, 0);
}
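
// Note on the DC math above (a reader-facing sketch, not part of the build):
// with both borders available, the DC predictor of a WxH block is the rounded
// average dc = (sum(above) + sum(left) + (W + H) / 2) >> log2(W + H), and
// vrshrn_n_u16(sum, 3) performs exactly that rounded shift for W = H = 4.
// With only one border the shift drops to 2; with neither, the predictor
// falls back to the mid-gray value 0x80.  A scalar equivalent:
//
//   int sum = 0, i;
//   for (i = 0; i < 4; ++i) sum += above[i] + left[i];
//   dc = (uint8_t)((sum + 4) >> 3);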

//------------------------------------------------------------------------------
// DC 8x8

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}

void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);
}
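
// As in dc_4x4, the 8x8 reduction is a pairwise-add ladder; eight border
// bytes need one more vpadd_u16 stage than four do, since each stage halves
// the number of partial sums.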

//------------------------------------------------------------------------------
// DC 16x16

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left border
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}

void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_16x16(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_16x16(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_16x16(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_16x16(dst, stride, NULL, NULL, 0, 0);
}
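
// Sixteen border bytes no longer fit in a d register, so the 16x16 variant
// widens the whole q register with vpaddlq_u8, folds the high half onto the
// low half with vadd_u16, and finishes with the same vpadd_u16 ladder.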

//------------------------------------------------------------------------------
// DC 32x32

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left border
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}

void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  dc_32x32(dst, stride, above, left, 1, 1);
}

void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  (void)above;
  dc_32x32(dst, stride, NULL, left, 0, 1);
}

void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  dc_32x32(dst, stride, above, NULL, 1, 0);
}

void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  dc_32x32(dst, stride, NULL, NULL, 0, 0);
}

// -----------------------------------------------------------------------------
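
// The D45 predictors below extrapolate along the 45-degree up-right diagonal.
// Each output pixel is the 3-tap rounded filter
// (p[i] + 2 * p[i + 1] + p[i + 2] + 2) >> 2 over the above row, built from
// two halving adds: vhadd_u8(a, c) computes (a + c) >> 1 with truncation, and
// vrhadd_u8 of that with b restores the rounding.  This is bit-exact, since
// (((a + c) >> 1) + b + 1) >> 1 == (a + 2 * b + c + 2) >> 2 for all 8-bit
// inputs.  Past the right edge of the block, the last above pixel is
// replicated.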

void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}

void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}

void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t A0 = vld1q_u8(above);  // top row
  const uint8x16_t above_right = vld1q_dup_u8(above + 15);
  const uint8x16_t A1 = vextq_u8(A0, above_right, 1);
  const uint8x16_t A2 = vextq_u8(A0, above_right, 2);
  const uint8x16_t avg1 = vhaddq_u8(A0, A2);
  uint8x16_t row = vrhaddq_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 15; ++i) {
    vst1q_u8(dst + i * stride, row);
    row = vextq_u8(row, above_right, 1);
  }
  vst1q_u8(dst + i * stride, row);
}

// -----------------------------------------------------------------------------
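
// D135 (down-right diagonal) prediction applies the same 3-tap rounded filter
// as D45, but across the concatenation of the reversed left column, the
// top-left pixel, and the above row (the LKJIXABC vector built below);
// successive output rows are one-byte shifts of the filtered result.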

void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}

#if !HAVE_NEON_ASM

void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint32x2_t d0u32 = vdup_n_u32(0);
  (void)left;

  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
  for (i = 0; i < 4; i++, dst += stride)
    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}

void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride)
    vst1_u8(dst, d0u8);
}

void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  for (i = 0; i < 16; i++, dst += stride)
    vst1q_u8(dst, q0u8);
}

void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)left;

  q0u8 = vld1q_u8(above);
  q1u8 = vld1q_u8(above + 16);
  for (i = 0; i < 32; i++, dst += stride) {
    vst1q_u8(dst, q0u8);
    vst1q_u8(dst + 16, q1u8);
  }
}

void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}

void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}
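
// For the wider H blocks each left-column byte is broadcast across a whole
// row with vdupq_lane_u8; the 16 left pixels of a q-register load are
// processed as low and high d-register halves.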

void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  q1u8 = vld1q_u8(left);
  d2u8 = vget_low_u8(q1u8);
  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
    q0u8 = vdupq_lane_u8(d2u8, 0);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 1);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 2);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 3);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 4);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 5);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 6);
    vst1q_u8(dst, q0u8);
    dst += stride;
    q0u8 = vdupq_lane_u8(d2u8, 7);
    vst1q_u8(dst, q0u8);
    dst += stride;
  }
}

void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}
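
// TM (TrueMotion) prediction: pred(x, y) = clip(left[y] + above[x] -
// above[-1]).  The implementations below precompute the widened difference
// (above[x] - above[-1]) once with vsubl_u8, add the broadcast left pixel per
// row, and let vqmovun_s16 handle the clamp to [0, 255].  A scalar sketch
// (illustration only, clip_u8 assumed to saturate to [0, 255]):
//
//   const int tl = above[-1];
//   for (y = 0; y < h; ++y)
//     for (x = 0; x < w; ++x)
//       dst[y * stride + x] = clip_u8(left[y] + above[x] - tl);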

void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);

  d0u8 = vld1_dup_u8(above - 1);
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}

void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int j;
  uint16x8_t q0u16, q3u16, q10u16;
  int16x8_t q0s16;
  uint16x4_t d20u16;
  uint8x8_t d0u8, d2u8, d30u8;

  d0u8 = vld1_dup_u8(above - 1);
  d30u8 = vld1_u8(left);
  d2u8 = vld1_u8(above);
  q10u16 = vmovl_u8(d30u8);
  q3u16 = vsubl_u8(d2u8, d0u8);
  d20u16 = vget_low_u16(q10u16);
  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
    q0u16 = vdupq_lane_u16(d20u16, 0);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 1);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 2);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
    q0u16 = vdupq_lane_u16(d20u16, 3);
    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
                      vreinterpretq_s16_u16(q0u16));
    d0u8 = vqmovun_s16(q0s16);
    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
    dst += stride;
  }
}

void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
  uint8x16_t q0u8, q1u8;
  int16x8_t q0s16, q1s16, q8s16, q11s16;
  uint16x4_t d20u16;
  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  for (k = 0; k < 2; k++, left += 8) {
    d18u8 = vld1_u8(left);
    q10u16 = vmovl_u8(d18u8);
    d20u16 = vget_low_u16(q10u16);
    for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
      q0u16 = vdupq_lane_u16(d20u16, 0);
      q8u16 = vdupq_lane_u16(d20u16, 1);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d20u16, 2);
      q8u16 = vdupq_lane_u16(d20u16, 3);
      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q2u16));
      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                        vreinterpretq_s16_u16(q3u16));
      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                         vreinterpretq_s16_u16(q2u16));
      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
                        vreinterpretq_s16_u16(q3u16));
      d2u8 = vqmovun_s16(q1s16);
      d3u8 = vqmovun_s16(q0s16);
      d22u8 = vqmovun_s16(q11s16);
      d23u8 = vqmovun_s16(q8s16);
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
      dst += stride;
      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
      dst += stride;
    }
  }
}
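
// The 32x32 TM predictor follows the same scheme scaled up: four widened
// difference vectors cover the 32 above pixels, and the left column is
// consumed in four 8-pixel chunks.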

void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
  uint8x16_t q0u8, q1u8, q2u8;
  int16x8_t q12s16, q13s16, q14s16, q15s16;
  uint16x4_t d6u16;
  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;

  q0u8 = vld1q_dup_u8(above - 1);
  q1u8 = vld1q_u8(above);
  q2u8 = vld1q_u8(above + 16);
  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
  for (k = 0; k < 4; k++, left += 8) {
    d26u8 = vld1_u8(left);
    q3u16 = vmovl_u8(d26u8);
    d6u16 = vget_low_u16(q3u16);
    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
      q0u16 = vdupq_lane_u16(d6u16, 0);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 1);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 2);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;

      q0u16 = vdupq_lane_u16(d6u16, 3);
      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q8u16));
      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q9u16));
      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q10u16));
      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
                         vreinterpretq_s16_u16(q11u16));
      d0u8 = vqmovun_s16(q12s16);
      d1u8 = vqmovun_s16(q13s16);
      d2u8 = vqmovun_s16(q14s16);
      d3u8 = vqmovun_s16(q15s16);
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
      dst += stride;
    }
  }
}
#endif  // !HAVE_NEON_ASM