1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC Neon. 19#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) 20 21// NEON downscalers with interpolation. 22// Provided by Fritz Koenig 23 24// Read 32x1 throw away even pixels, and write 16x1. 25void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 26 uint8* dst, int dst_width) { 27 asm volatile ( 28 ".p2align 2 \n" 29 "1: \n" 30 // load even pixels into q0, odd into q1 31 MEMACCESS(0) 32 "vld2.8 {q0, q1}, [%0]! \n" 33 "subs %2, %2, #16 \n" // 16 processed per loop 34 MEMACCESS(1) 35 "vst1.8 {q1}, [%1]! \n" // store odd pixels 36 "bgt 1b \n" 37 : "+r"(src_ptr), // %0 38 "+r"(dst), // %1 39 "+r"(dst_width) // %2 40 : 41 : "q0", "q1" // Clobber List 42 ); 43} 44 45// Read 32x2 average down and write 16x1. 46void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 47 uint8* dst, int dst_width) { 48 asm volatile ( 49 // change the stride to row 2 pointer 50 "add %1, %0 \n" 51 ".p2align 2 \n" 52 "1: \n" 53 MEMACCESS(0) 54 "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc 55 MEMACCESS(1) 56 "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc 57 "subs %3, %3, #16 \n" // 16 processed per loop 58 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent 59 "vpaddl.u8 q1, q1 \n" 60 "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 61 "vpadal.u8 q1, q3 \n" 62 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 63 "vrshrn.u16 d1, q1, #2 \n" 64 MEMACCESS(2) 65 "vst1.8 {q0}, [%2]! \n" 66 "bgt 1b \n" 67 : "+r"(src_ptr), // %0 68 "+r"(src_stride), // %1 69 "+r"(dst), // %2 70 "+r"(dst_width) // %3 71 : 72 : "q0", "q1", "q2", "q3" // Clobber List 73 ); 74} 75 76void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 77 uint8* dst_ptr, int dst_width) { 78 asm volatile ( 79 ".p2align 2 \n" 80 "1: \n" 81 MEMACCESS(0) 82 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 83 "subs %2, %2, #8 \n" // 8 processed per loop 84 MEMACCESS(1) 85 "vst1.8 {d2}, [%1]! \n" 86 "bgt 1b \n" 87 : "+r"(src_ptr), // %0 88 "+r"(dst_ptr), // %1 89 "+r"(dst_width) // %2 90 : 91 : "q0", "q1", "memory", "cc" 92 ); 93} 94 95void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 96 uint8* dst_ptr, int dst_width) { 97 const uint8* src_ptr1 = src_ptr + src_stride; 98 const uint8* src_ptr2 = src_ptr + src_stride * 2; 99 const uint8* src_ptr3 = src_ptr + src_stride * 3; 100asm volatile ( 101 ".p2align 2 \n" 102 "1: \n" 103 MEMACCESS(0) 104 "vld1.8 {q0}, [%0]! \n" // load up 16x4 105 MEMACCESS(3) 106 "vld1.8 {q1}, [%3]! \n" 107 MEMACCESS(4) 108 "vld1.8 {q2}, [%4]! \n" 109 MEMACCESS(5) 110 "vld1.8 {q3}, [%5]! \n" 111 "subs %2, %2, #4 \n" 112 "vpaddl.u8 q0, q0 \n" 113 "vpadal.u8 q0, q1 \n" 114 "vpadal.u8 q0, q2 \n" 115 "vpadal.u8 q0, q3 \n" 116 "vpaddl.u16 q0, q0 \n" 117 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding 118 "vmovn.u16 d0, q0 \n" 119 MEMACCESS(1) 120 "vst1.32 {d0[0]}, [%1]! \n" 121 "bgt 1b \n" 122 : "+r"(src_ptr), // %0 123 "+r"(dst_ptr), // %1 124 "+r"(dst_width), // %2 125 "+r"(src_ptr1), // %3 126 "+r"(src_ptr2), // %4 127 "+r"(src_ptr3) // %5 128 : 129 : "q0", "q1", "q2", "q3", "memory", "cc" 130 ); 131} 132 133// Down scale from 4 to 3 pixels. Use the neon multilane read/write 134// to load up the every 4th pixel into a 4 different registers. 135// Point samples 32 pixels to 24 pixels. 136void ScaleRowDown34_NEON(const uint8* src_ptr, 137 ptrdiff_t src_stride, 138 uint8* dst_ptr, int dst_width) { 139 asm volatile ( 140 ".p2align 2 \n" 141 "1: \n" 142 MEMACCESS(0) 143 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 144 "subs %2, %2, #24 \n" 145 "vmov d2, d3 \n" // order d0, d1, d2 146 MEMACCESS(1) 147 "vst3.8 {d0, d1, d2}, [%1]! \n" 148 "bgt 1b \n" 149 : "+r"(src_ptr), // %0 150 "+r"(dst_ptr), // %1 151 "+r"(dst_width) // %2 152 : 153 : "d0", "d1", "d2", "d3", "memory", "cc" 154 ); 155} 156 157void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, 158 ptrdiff_t src_stride, 159 uint8* dst_ptr, int dst_width) { 160 asm volatile ( 161 "vmov.u8 d24, #3 \n" 162 "add %3, %0 \n" 163 ".p2align 2 \n" 164 "1: \n" 165 MEMACCESS(0) 166 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 167 MEMACCESS(3) 168 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 169 "subs %2, %2, #24 \n" 170 171 // filter src line 0 with src line 1 172 // expand chars to shorts to allow for room 173 // when adding lines together 174 "vmovl.u8 q8, d4 \n" 175 "vmovl.u8 q9, d5 \n" 176 "vmovl.u8 q10, d6 \n" 177 "vmovl.u8 q11, d7 \n" 178 179 // 3 * line_0 + line_1 180 "vmlal.u8 q8, d0, d24 \n" 181 "vmlal.u8 q9, d1, d24 \n" 182 "vmlal.u8 q10, d2, d24 \n" 183 "vmlal.u8 q11, d3, d24 \n" 184 185 // (3 * line_0 + line_1) >> 2 186 "vqrshrn.u16 d0, q8, #2 \n" 187 "vqrshrn.u16 d1, q9, #2 \n" 188 "vqrshrn.u16 d2, q10, #2 \n" 189 "vqrshrn.u16 d3, q11, #2 \n" 190 191 // a0 = (src[0] * 3 + s[1] * 1) >> 2 192 "vmovl.u8 q8, d1 \n" 193 "vmlal.u8 q8, d0, d24 \n" 194 "vqrshrn.u16 d0, q8, #2 \n" 195 196 // a1 = (src[1] * 1 + s[2] * 1) >> 1 197 "vrhadd.u8 d1, d1, d2 \n" 198 199 // a2 = (src[2] * 1 + s[3] * 3) >> 2 200 "vmovl.u8 q8, d2 \n" 201 "vmlal.u8 q8, d3, d24 \n" 202 "vqrshrn.u16 d2, q8, #2 \n" 203 204 MEMACCESS(1) 205 "vst3.8 {d0, d1, d2}, [%1]! \n" 206 207 "bgt 1b \n" 208 : "+r"(src_ptr), // %0 209 "+r"(dst_ptr), // %1 210 "+r"(dst_width), // %2 211 "+r"(src_stride) // %3 212 : 213 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" 214 ); 215} 216 217void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, 218 ptrdiff_t src_stride, 219 uint8* dst_ptr, int dst_width) { 220 asm volatile ( 221 "vmov.u8 d24, #3 \n" 222 "add %3, %0 \n" 223 ".p2align 2 \n" 224 "1: \n" 225 MEMACCESS(0) 226 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 227 MEMACCESS(3) 228 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 229 "subs %2, %2, #24 \n" 230 // average src line 0 with src line 1 231 "vrhadd.u8 q0, q0, q2 \n" 232 "vrhadd.u8 q1, q1, q3 \n" 233 234 // a0 = (src[0] * 3 + s[1] * 1) >> 2 235 "vmovl.u8 q3, d1 \n" 236 "vmlal.u8 q3, d0, d24 \n" 237 "vqrshrn.u16 d0, q3, #2 \n" 238 239 // a1 = (src[1] * 1 + s[2] * 1) >> 1 240 "vrhadd.u8 d1, d1, d2 \n" 241 242 // a2 = (src[2] * 1 + s[3] * 3) >> 2 243 "vmovl.u8 q3, d2 \n" 244 "vmlal.u8 q3, d3, d24 \n" 245 "vqrshrn.u16 d2, q3, #2 \n" 246 247 MEMACCESS(1) 248 "vst3.8 {d0, d1, d2}, [%1]! \n" 249 "bgt 1b \n" 250 : "+r"(src_ptr), // %0 251 "+r"(dst_ptr), // %1 252 "+r"(dst_width), // %2 253 "+r"(src_stride) // %3 254 : 255 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" 256 ); 257} 258 259#define HAS_SCALEROWDOWN38_NEON 260static uvec8 kShuf38 = 261 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; 262static uvec8 kShuf38_2 = 263 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; 264static vec16 kMult38_Div6 = 265 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 266 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; 267static vec16 kMult38_Div9 = 268 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 269 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; 270 271// 32 -> 12 272void ScaleRowDown38_NEON(const uint8* src_ptr, 273 ptrdiff_t src_stride, 274 uint8* dst_ptr, int dst_width) { 275 asm volatile ( 276 MEMACCESS(3) 277 "vld1.8 {q3}, [%3] \n" 278 ".p2align 2 \n" 279 "1: \n" 280 MEMACCESS(0) 281 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" 282 "subs %2, %2, #12 \n" 283 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" 284 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" 285 MEMACCESS(1) 286 "vst1.8 {d4}, [%1]! \n" 287 MEMACCESS(1) 288 "vst1.32 {d5[0]}, [%1]! \n" 289 "bgt 1b \n" 290 : "+r"(src_ptr), // %0 291 "+r"(dst_ptr), // %1 292 "+r"(dst_width) // %2 293 : "r"(&kShuf38) // %3 294 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" 295 ); 296} 297 298// 32x3 -> 12x1 299void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, 300 ptrdiff_t src_stride, 301 uint8* dst_ptr, int dst_width) { 302 const uint8* src_ptr1 = src_ptr + src_stride * 2; 303 304 asm volatile ( 305 MEMACCESS(5) 306 "vld1.16 {q13}, [%5] \n" 307 MEMACCESS(6) 308 "vld1.8 {q14}, [%6] \n" 309 MEMACCESS(7) 310 "vld1.8 {q15}, [%7] \n" 311 "add %3, %0 \n" 312 ".p2align 2 \n" 313 "1: \n" 314 315 // d0 = 00 40 01 41 02 42 03 43 316 // d1 = 10 50 11 51 12 52 13 53 317 // d2 = 20 60 21 61 22 62 23 63 318 // d3 = 30 70 31 71 32 72 33 73 319 MEMACCESS(0) 320 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" 321 MEMACCESS(3) 322 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" 323 MEMACCESS(4) 324 "vld4.8 {d16, d17, d18, d19}, [%4]! \n" 325 "subs %2, %2, #12 \n" 326 327 // Shuffle the input data around to get align the data 328 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 329 // d0 = 00 10 01 11 02 12 03 13 330 // d1 = 40 50 41 51 42 52 43 53 331 "vtrn.u8 d0, d1 \n" 332 "vtrn.u8 d4, d5 \n" 333 "vtrn.u8 d16, d17 \n" 334 335 // d2 = 20 30 21 31 22 32 23 33 336 // d3 = 60 70 61 71 62 72 63 73 337 "vtrn.u8 d2, d3 \n" 338 "vtrn.u8 d6, d7 \n" 339 "vtrn.u8 d18, d19 \n" 340 341 // d0 = 00+10 01+11 02+12 03+13 342 // d2 = 40+50 41+51 42+52 43+53 343 "vpaddl.u8 q0, q0 \n" 344 "vpaddl.u8 q2, q2 \n" 345 "vpaddl.u8 q8, q8 \n" 346 347 // d3 = 60+70 61+71 62+72 63+73 348 "vpaddl.u8 d3, d3 \n" 349 "vpaddl.u8 d7, d7 \n" 350 "vpaddl.u8 d19, d19 \n" 351 352 // combine source lines 353 "vadd.u16 q0, q2 \n" 354 "vadd.u16 q0, q8 \n" 355 "vadd.u16 d4, d3, d7 \n" 356 "vadd.u16 d4, d19 \n" 357 358 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] 359 // + s[6 + st * 1] + s[7 + st * 1] 360 // + s[6 + st * 2] + s[7 + st * 2]) / 6 361 "vqrdmulh.s16 q2, q2, q13 \n" 362 "vmovn.u16 d4, q2 \n" 363 364 // Shuffle 2,3 reg around so that 2 can be added to the 365 // 0,1 reg and 3 can be added to the 4,5 reg. This 366 // requires expanding from u8 to u16 as the 0,1 and 4,5 367 // registers are already expanded. Then do transposes 368 // to get aligned. 369 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 370 "vmovl.u8 q1, d2 \n" 371 "vmovl.u8 q3, d6 \n" 372 "vmovl.u8 q9, d18 \n" 373 374 // combine source lines 375 "vadd.u16 q1, q3 \n" 376 "vadd.u16 q1, q9 \n" 377 378 // d4 = xx 20 xx 30 xx 22 xx 32 379 // d5 = xx 21 xx 31 xx 23 xx 33 380 "vtrn.u32 d2, d3 \n" 381 382 // d4 = xx 20 xx 21 xx 22 xx 23 383 // d5 = xx 30 xx 31 xx 32 xx 33 384 "vtrn.u16 d2, d3 \n" 385 386 // 0+1+2, 3+4+5 387 "vadd.u16 q0, q1 \n" 388 389 // Need to divide, but can't downshift as the the value 390 // isn't a power of 2. So multiply by 65536 / n 391 // and take the upper 16 bits. 392 "vqrdmulh.s16 q0, q0, q15 \n" 393 394 // Align for table lookup, vtbl requires registers to 395 // be adjacent 396 "vmov.u8 d2, d4 \n" 397 398 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 399 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 400 401 MEMACCESS(1) 402 "vst1.8 {d3}, [%1]! \n" 403 MEMACCESS(1) 404 "vst1.32 {d4[0]}, [%1]! \n" 405 "bgt 1b \n" 406 : "+r"(src_ptr), // %0 407 "+r"(dst_ptr), // %1 408 "+r"(dst_width), // %2 409 "+r"(src_stride), // %3 410 "+r"(src_ptr1) // %4 411 : "r"(&kMult38_Div6), // %5 412 "r"(&kShuf38_2), // %6 413 "r"(&kMult38_Div9) // %7 414 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" 415 ); 416} 417 418// 32x2 -> 12x1 419void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, 420 ptrdiff_t src_stride, 421 uint8* dst_ptr, int dst_width) { 422 asm volatile ( 423 MEMACCESS(4) 424 "vld1.16 {q13}, [%4] \n" 425 MEMACCESS(5) 426 "vld1.8 {q14}, [%5] \n" 427 "add %3, %0 \n" 428 ".p2align 2 \n" 429 "1: \n" 430 431 // d0 = 00 40 01 41 02 42 03 43 432 // d1 = 10 50 11 51 12 52 13 53 433 // d2 = 20 60 21 61 22 62 23 63 434 // d3 = 30 70 31 71 32 72 33 73 435 MEMACCESS(0) 436 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" 437 MEMACCESS(3) 438 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" 439 "subs %2, %2, #12 \n" 440 441 // Shuffle the input data around to get align the data 442 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 443 // d0 = 00 10 01 11 02 12 03 13 444 // d1 = 40 50 41 51 42 52 43 53 445 "vtrn.u8 d0, d1 \n" 446 "vtrn.u8 d4, d5 \n" 447 448 // d2 = 20 30 21 31 22 32 23 33 449 // d3 = 60 70 61 71 62 72 63 73 450 "vtrn.u8 d2, d3 \n" 451 "vtrn.u8 d6, d7 \n" 452 453 // d0 = 00+10 01+11 02+12 03+13 454 // d2 = 40+50 41+51 42+52 43+53 455 "vpaddl.u8 q0, q0 \n" 456 "vpaddl.u8 q2, q2 \n" 457 458 // d3 = 60+70 61+71 62+72 63+73 459 "vpaddl.u8 d3, d3 \n" 460 "vpaddl.u8 d7, d7 \n" 461 462 // combine source lines 463 "vadd.u16 q0, q2 \n" 464 "vadd.u16 d4, d3, d7 \n" 465 466 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 467 "vqrshrn.u16 d4, q2, #2 \n" 468 469 // Shuffle 2,3 reg around so that 2 can be added to the 470 // 0,1 reg and 3 can be added to the 4,5 reg. This 471 // requires expanding from u8 to u16 as the 0,1 and 4,5 472 // registers are already expanded. Then do transposes 473 // to get aligned. 474 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 475 "vmovl.u8 q1, d2 \n" 476 "vmovl.u8 q3, d6 \n" 477 478 // combine source lines 479 "vadd.u16 q1, q3 \n" 480 481 // d4 = xx 20 xx 30 xx 22 xx 32 482 // d5 = xx 21 xx 31 xx 23 xx 33 483 "vtrn.u32 d2, d3 \n" 484 485 // d4 = xx 20 xx 21 xx 22 xx 23 486 // d5 = xx 30 xx 31 xx 32 xx 33 487 "vtrn.u16 d2, d3 \n" 488 489 // 0+1+2, 3+4+5 490 "vadd.u16 q0, q1 \n" 491 492 // Need to divide, but can't downshift as the the value 493 // isn't a power of 2. So multiply by 65536 / n 494 // and take the upper 16 bits. 495 "vqrdmulh.s16 q0, q0, q13 \n" 496 497 // Align for table lookup, vtbl requires registers to 498 // be adjacent 499 "vmov.u8 d2, d4 \n" 500 501 "vtbl.u8 d3, {d0, d1, d2}, d28 \n" 502 "vtbl.u8 d4, {d0, d1, d2}, d29 \n" 503 504 MEMACCESS(1) 505 "vst1.8 {d3}, [%1]! \n" 506 MEMACCESS(1) 507 "vst1.32 {d4[0]}, [%1]! \n" 508 "bgt 1b \n" 509 : "+r"(src_ptr), // %0 510 "+r"(dst_ptr), // %1 511 "+r"(dst_width), // %2 512 "+r"(src_stride) // %3 513 : "r"(&kMult38_Div6), // %4 514 "r"(&kShuf38_2) // %5 515 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" 516 ); 517} 518 519// 16x2 -> 16x1 520void ScaleFilterRows_NEON(uint8* dst_ptr, 521 const uint8* src_ptr, ptrdiff_t src_stride, 522 int dst_width, int source_y_fraction) { 523 asm volatile ( 524 "cmp %4, #0 \n" 525 "beq 100f \n" 526 "add %2, %1 \n" 527 "cmp %4, #64 \n" 528 "beq 75f \n" 529 "cmp %4, #128 \n" 530 "beq 50f \n" 531 "cmp %4, #192 \n" 532 "beq 25f \n" 533 534 "vdup.8 d5, %4 \n" 535 "rsb %4, #256 \n" 536 "vdup.8 d4, %4 \n" 537 // General purpose row blend. 538 "1: \n" 539 MEMACCESS(1) 540 "vld1.8 {q0}, [%1]! \n" 541 MEMACCESS(2) 542 "vld1.8 {q1}, [%2]! \n" 543 "subs %3, %3, #16 \n" 544 "vmull.u8 q13, d0, d4 \n" 545 "vmull.u8 q14, d1, d4 \n" 546 "vmlal.u8 q13, d2, d5 \n" 547 "vmlal.u8 q14, d3, d5 \n" 548 "vrshrn.u16 d0, q13, #8 \n" 549 "vrshrn.u16 d1, q14, #8 \n" 550 MEMACCESS(0) 551 "vst1.8 {q0}, [%0]! \n" 552 "bgt 1b \n" 553 "b 99f \n" 554 555 // Blend 25 / 75. 556 "25: \n" 557 MEMACCESS(1) 558 "vld1.8 {q0}, [%1]! \n" 559 MEMACCESS(2) 560 "vld1.8 {q1}, [%2]! \n" 561 "subs %3, %3, #16 \n" 562 "vrhadd.u8 q0, q1 \n" 563 "vrhadd.u8 q0, q1 \n" 564 MEMACCESS(0) 565 "vst1.8 {q0}, [%0]! \n" 566 "bgt 25b \n" 567 "b 99f \n" 568 569 // Blend 50 / 50. 570 "50: \n" 571 MEMACCESS(1) 572 "vld1.8 {q0}, [%1]! \n" 573 MEMACCESS(2) 574 "vld1.8 {q1}, [%2]! \n" 575 "subs %3, %3, #16 \n" 576 "vrhadd.u8 q0, q1 \n" 577 MEMACCESS(0) 578 "vst1.8 {q0}, [%0]! \n" 579 "bgt 50b \n" 580 "b 99f \n" 581 582 // Blend 75 / 25. 583 "75: \n" 584 MEMACCESS(1) 585 "vld1.8 {q1}, [%1]! \n" 586 MEMACCESS(2) 587 "vld1.8 {q0}, [%2]! \n" 588 "subs %3, %3, #16 \n" 589 "vrhadd.u8 q0, q1 \n" 590 "vrhadd.u8 q0, q1 \n" 591 MEMACCESS(0) 592 "vst1.8 {q0}, [%0]! \n" 593 "bgt 75b \n" 594 "b 99f \n" 595 596 // Blend 100 / 0 - Copy row unchanged. 597 "100: \n" 598 MEMACCESS(1) 599 "vld1.8 {q0}, [%1]! \n" 600 "subs %3, %3, #16 \n" 601 MEMACCESS(0) 602 "vst1.8 {q0}, [%0]! \n" 603 "bgt 100b \n" 604 605 "99: \n" 606 MEMACCESS(0) 607 "vst1.8 {d1[7]}, [%0] \n" 608 : "+r"(dst_ptr), // %0 609 "+r"(src_ptr), // %1 610 "+r"(src_stride), // %2 611 "+r"(dst_width), // %3 612 "+r"(source_y_fraction) // %4 613 : 614 : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" 615 ); 616} 617 618void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 619 uint8* dst, int dst_width) { 620 asm volatile ( 621 ".p2align 2 \n" 622 "1: \n" 623 // load even pixels into q0, odd into q1 624 MEMACCESS(0) 625 "vld2.32 {q0, q1}, [%0]! \n" 626 MEMACCESS(0) 627 "vld2.32 {q2, q3}, [%0]! \n" 628 "subs %2, %2, #8 \n" // 8 processed per loop 629 MEMACCESS(1) 630 "vst1.8 {q1}, [%1]! \n" // store odd pixels 631 MEMACCESS(1) 632 "vst1.8 {q3}, [%1]! \n" 633 "bgt 1b \n" 634 : "+r"(src_ptr), // %0 635 "+r"(dst), // %1 636 "+r"(dst_width) // %2 637 : 638 : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List 639 ); 640} 641 642void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, 643 uint8* dst, int dst_width) { 644 asm volatile ( 645 // change the stride to row 2 pointer 646 "add %1, %1, %0 \n" 647 ".p2align 2 \n" 648 "1: \n" 649 MEMACCESS(0) 650 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 651 MEMACCESS(0) 652 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 653 "subs %3, %3, #8 \n" // 8 processed per loop. 654 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 655 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 656 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 657 "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. 658 MEMACCESS(1) 659 "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. 660 MEMACCESS(1) 661 "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. 662 "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. 663 "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 664 "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. 665 "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. 666 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack 667 "vrshrn.u16 d1, q1, #2 \n" 668 "vrshrn.u16 d2, q2, #2 \n" 669 "vrshrn.u16 d3, q3, #2 \n" 670 MEMACCESS(2) 671 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" 672 "bgt 1b \n" 673 : "+r"(src_ptr), // %0 674 "+r"(src_stride), // %1 675 "+r"(dst), // %2 676 "+r"(dst_width) // %3 677 : 678 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" 679 ); 680} 681 682// Reads 4 pixels at a time. 683// Alignment requirement: src_argb 4 byte aligned. 684void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, 685 int src_stepx, uint8* dst_argb, int dst_width) { 686 asm volatile ( 687 "mov r12, %3, lsl #2 \n" 688 ".p2align 2 \n" 689 "1: \n" 690 MEMACCESS(0) 691 "vld1.32 {d0[0]}, [%0], r12 \n" 692 MEMACCESS(0) 693 "vld1.32 {d0[1]}, [%0], r12 \n" 694 MEMACCESS(0) 695 "vld1.32 {d1[0]}, [%0], r12 \n" 696 MEMACCESS(0) 697 "vld1.32 {d1[1]}, [%0], r12 \n" 698 "subs %2, %2, #4 \n" // 4 pixels per loop. 699 MEMACCESS(1) 700 "vst1.8 {q0}, [%1]! \n" 701 "bgt 1b \n" 702 : "+r"(src_argb), // %0 703 "+r"(dst_argb), // %1 704 "+r"(dst_width) // %2 705 : "r"(src_stepx) // %3 706 : "memory", "cc", "r12", "q0" 707 ); 708} 709 710// Reads 4 pixels at a time. 711// Alignment requirement: src_argb 4 byte aligned. 712void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, 713 int src_stepx, 714 uint8* dst_argb, int dst_width) { 715 asm volatile ( 716 "mov r12, %4, lsl #2 \n" 717 "add %1, %1, %0 \n" 718 ".p2align 2 \n" 719 "1: \n" 720 MEMACCESS(0) 721 "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 722 MEMACCESS(1) 723 "vld1.8 {d1}, [%1], r12 \n" 724 MEMACCESS(0) 725 "vld1.8 {d2}, [%0], r12 \n" 726 MEMACCESS(1) 727 "vld1.8 {d3}, [%1], r12 \n" 728 MEMACCESS(0) 729 "vld1.8 {d4}, [%0], r12 \n" 730 MEMACCESS(1) 731 "vld1.8 {d5}, [%1], r12 \n" 732 MEMACCESS(0) 733 "vld1.8 {d6}, [%0], r12 \n" 734 MEMACCESS(1) 735 "vld1.8 {d7}, [%1], r12 \n" 736 "vaddl.u8 q0, d0, d1 \n" 737 "vaddl.u8 q1, d2, d3 \n" 738 "vaddl.u8 q2, d4, d5 \n" 739 "vaddl.u8 q3, d6, d7 \n" 740 "vswp.8 d1, d2 \n" // ab_cd -> ac_bd 741 "vswp.8 d5, d6 \n" // ef_gh -> eg_fh 742 "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) 743 "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) 744 "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. 745 "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. 746 "subs %3, %3, #4 \n" // 4 pixels per loop. 747 MEMACCESS(2) 748 "vst1.8 {q0}, [%2]! \n" 749 "bgt 1b \n" 750 : "+r"(src_argb), // %0 751 "+r"(src_stride), // %1 752 "+r"(dst_argb), // %2 753 "+r"(dst_width) // %3 754 : "r"(src_stepx) // %4 755 : "memory", "cc", "r12", "q0", "q1", "q2", "q3" 756 ); 757} 758 759#endif // __ARM_NEON__ 760 761#ifdef __cplusplus 762} // extern "C" 763} // namespace libyuv 764#endif 765