1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC Neon 19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 20 21// Read 8 Y, 4 U and 4 V from 422 22#define READYUV422 \ 23 MEMACCESS(0) \ 24 "vld1.8 {d0}, [%0]! \n" \ 25 MEMACCESS(1) \ 26 "vld1.32 {d2[0]}, [%1]! \n" \ 27 MEMACCESS(2) \ 28 "vld1.32 {d2[1]}, [%2]! \n" 29 30// Read 8 Y, 2 U and 2 V from 422 31#define READYUV411 \ 32 MEMACCESS(0) \ 33 "vld1.8 {d0}, [%0]! \n" \ 34 MEMACCESS(1) \ 35 "vld1.16 {d2[0]}, [%1]! \n" \ 36 MEMACCESS(2) \ 37 "vld1.16 {d2[1]}, [%2]! \n" \ 38 "vmov.u8 d3, d2 \n" \ 39 "vzip.u8 d2, d3 \n" 40 41// Read 8 Y, 8 U and 8 V from 444 42#define READYUV444 \ 43 MEMACCESS(0) \ 44 "vld1.8 {d0}, [%0]! \n" \ 45 MEMACCESS(1) \ 46 "vld1.8 {d2}, [%1]! \n" \ 47 MEMACCESS(2) \ 48 "vld1.8 {d3}, [%2]! \n" \ 49 "vpaddl.u8 q1, q1 \n" \ 50 "vrshrn.u16 d2, q1, #1 \n" 51 52// Read 8 Y, and set 4 U and 4 V to 128 53#define READYUV400 \ 54 MEMACCESS(0) \ 55 "vld1.8 {d0}, [%0]! \n" \ 56 "vmov.u8 d2, #128 \n" 57 58// Read 8 Y and 4 UV from NV12 59#define READNV12 \ 60 MEMACCESS(0) \ 61 "vld1.8 {d0}, [%0]! \n" \ 62 MEMACCESS(1) \ 63 "vld1.8 {d2}, [%1]! \n" \ 64 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ 65 "vuzp.u8 d2, d3 \n" \ 66 "vtrn.u32 d2, d3 \n" 67 68// Read 8 Y and 4 VU from NV21 69#define READNV21 \ 70 MEMACCESS(0) \ 71 "vld1.8 {d0}, [%0]! \n" \ 72 MEMACCESS(1) \ 73 "vld1.8 {d2}, [%1]! \n" \ 74 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ 75 "vuzp.u8 d3, d2 \n" \ 76 "vtrn.u32 d2, d3 \n" 77 78// Read 8 YUY2 79#define READYUY2 \ 80 MEMACCESS(0) \ 81 "vld2.8 {d0, d2}, [%0]! \n" \ 82 "vmov.u8 d3, d2 \n" \ 83 "vuzp.u8 d2, d3 \n" \ 84 "vtrn.u32 d2, d3 \n" 85 86// Read 8 UYVY 87#define READUYVY \ 88 MEMACCESS(0) \ 89 "vld2.8 {d2, d3}, [%0]! \n" \ 90 "vmov.u8 d0, d3 \n" \ 91 "vmov.u8 d3, d2 \n" \ 92 "vuzp.u8 d2, d3 \n" \ 93 "vtrn.u32 d2, d3 \n" 94 95#define YUV422TORGB \ 96 "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ 97 "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\ 98 "vmull.s8 q9, d2, d25 \n"/* u/v G component */\ 99 "vmov.u8 d1, #0 \n"/* split odd/even y apart */\ 100 "vtrn.u8 d0, d1 \n" \ 101 "vsub.s16 q0, q0, q15 \n"/* offset y */\ 102 "vmul.s16 q0, q0, q14 \n" \ 103 "vadd.s16 d18, d19 \n" \ 104 "vqadd.s16 d20, d0, d16 \n" /* B */ \ 105 "vqadd.s16 d21, d1, d16 \n" \ 106 "vqadd.s16 d22, d0, d17 \n" /* R */ \ 107 "vqadd.s16 d23, d1, d17 \n" \ 108 "vqadd.s16 d16, d0, d18 \n" /* G */ \ 109 "vqadd.s16 d17, d1, d18 \n" \ 110 "vqshrun.s16 d0, q10, #6 \n" /* B */ \ 111 "vqshrun.s16 d1, q11, #6 \n" /* G */ \ 112 "vqshrun.s16 d2, q8, #6 \n" /* R */ \ 113 "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\ 114 "vmovl.u8 q11, d1 \n" \ 115 "vmovl.u8 q8, d2 \n" \ 116 "vtrn.u8 d20, d21 \n" \ 117 "vtrn.u8 d22, d23 \n" \ 118 "vtrn.u8 d16, d17 \n" \ 119 "vmov.u8 d21, d16 \n" 120 121static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, 122 0, 0, 0, 0, 0, 0, 0, 0 }; 123static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, 124 0, 0, 0, 0, 0, 0, 0, 0 }; 125 126#ifdef HAS_I444TOARGBROW_NEON 127void I444ToARGBRow_NEON(const uint8* src_y, 128 const uint8* src_u, 129 const uint8* src_v, 130 uint8* dst_argb, 131 int width) { 132 asm volatile ( 133 MEMACCESS(5) 134 "vld1.8 {d24}, [%5] \n" 135 MEMACCESS(6) 136 "vld1.8 {d25}, [%6] \n" 137 "vmov.u8 d26, #128 \n" 138 "vmov.u16 q14, #74 \n" 139 "vmov.u16 q15, #16 \n" 140 ".p2align 2 \n" 141 "1: \n" 142 READYUV444 143 YUV422TORGB 144 "subs %4, %4, #8 \n" 145 "vmov.u8 d23, #255 \n" 146 MEMACCESS(3) 147 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 148 "bgt 1b \n" 149 : "+r"(src_y), // %0 150 "+r"(src_u), // %1 151 "+r"(src_v), // %2 152 "+r"(dst_argb), // %3 153 "+r"(width) // %4 154 : "r"(&kUVToRB), // %5 155 "r"(&kUVToG) // %6 156 : "cc", "memory", "q0", "q1", "q2", "q3", 157 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 158 ); 159} 160#endif // HAS_I444TOARGBROW_NEON 161 162#ifdef HAS_I422TOARGBROW_NEON 163void I422ToARGBRow_NEON(const uint8* src_y, 164 const uint8* src_u, 165 const uint8* src_v, 166 uint8* dst_argb, 167 int width) { 168 asm volatile ( 169 MEMACCESS(5) 170 "vld1.8 {d24}, [%5] \n" 171 MEMACCESS(6) 172 "vld1.8 {d25}, [%6] \n" 173 "vmov.u8 d26, #128 \n" 174 "vmov.u16 q14, #74 \n" 175 "vmov.u16 q15, #16 \n" 176 ".p2align 2 \n" 177 "1: \n" 178 READYUV422 179 YUV422TORGB 180 "subs %4, %4, #8 \n" 181 "vmov.u8 d23, #255 \n" 182 MEMACCESS(3) 183 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 184 "bgt 1b \n" 185 : "+r"(src_y), // %0 186 "+r"(src_u), // %1 187 "+r"(src_v), // %2 188 "+r"(dst_argb), // %3 189 "+r"(width) // %4 190 : "r"(&kUVToRB), // %5 191 "r"(&kUVToG) // %6 192 : "cc", "memory", "q0", "q1", "q2", "q3", 193 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 194 ); 195} 196#endif // HAS_I422TOARGBROW_NEON 197 198#ifdef HAS_I411TOARGBROW_NEON 199void I411ToARGBRow_NEON(const uint8* src_y, 200 const uint8* src_u, 201 const uint8* src_v, 202 uint8* dst_argb, 203 int width) { 204 asm volatile ( 205 MEMACCESS(5) 206 "vld1.8 {d24}, [%5] \n" 207 MEMACCESS(6) 208 "vld1.8 {d25}, [%6] \n" 209 "vmov.u8 d26, #128 \n" 210 "vmov.u16 q14, #74 \n" 211 "vmov.u16 q15, #16 \n" 212 ".p2align 2 \n" 213 "1: \n" 214 READYUV411 215 YUV422TORGB 216 "subs %4, %4, #8 \n" 217 "vmov.u8 d23, #255 \n" 218 MEMACCESS(3) 219 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 220 "bgt 1b \n" 221 : "+r"(src_y), // %0 222 "+r"(src_u), // %1 223 "+r"(src_v), // %2 224 "+r"(dst_argb), // %3 225 "+r"(width) // %4 226 : "r"(&kUVToRB), // %5 227 "r"(&kUVToG) // %6 228 : "cc", "memory", "q0", "q1", "q2", "q3", 229 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 230 ); 231} 232#endif // HAS_I411TOARGBROW_NEON 233 234#ifdef HAS_I422TOBGRAROW_NEON 235void I422ToBGRARow_NEON(const uint8* src_y, 236 const uint8* src_u, 237 const uint8* src_v, 238 uint8* dst_bgra, 239 int width) { 240 asm volatile ( 241 MEMACCESS(5) 242 "vld1.8 {d24}, [%5] \n" 243 MEMACCESS(6) 244 "vld1.8 {d25}, [%6] \n" 245 "vmov.u8 d26, #128 \n" 246 "vmov.u16 q14, #74 \n" 247 "vmov.u16 q15, #16 \n" 248 ".p2align 2 \n" 249 "1: \n" 250 READYUV422 251 YUV422TORGB 252 "subs %4, %4, #8 \n" 253 "vswp.u8 d20, d22 \n" 254 "vmov.u8 d19, #255 \n" 255 MEMACCESS(3) 256 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" 257 "bgt 1b \n" 258 : "+r"(src_y), // %0 259 "+r"(src_u), // %1 260 "+r"(src_v), // %2 261 "+r"(dst_bgra), // %3 262 "+r"(width) // %4 263 : "r"(&kUVToRB), // %5 264 "r"(&kUVToG) // %6 265 : "cc", "memory", "q0", "q1", "q2", "q3", 266 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 267 ); 268} 269#endif // HAS_I422TOBGRAROW_NEON 270 271#ifdef HAS_I422TOABGRROW_NEON 272void I422ToABGRRow_NEON(const uint8* src_y, 273 const uint8* src_u, 274 const uint8* src_v, 275 uint8* dst_abgr, 276 int width) { 277 asm volatile ( 278 MEMACCESS(5) 279 "vld1.8 {d24}, [%5] \n" 280 MEMACCESS(6) 281 "vld1.8 {d25}, [%6] \n" 282 "vmov.u8 d26, #128 \n" 283 "vmov.u16 q14, #74 \n" 284 "vmov.u16 q15, #16 \n" 285 ".p2align 2 \n" 286 "1: \n" 287 READYUV422 288 YUV422TORGB 289 "subs %4, %4, #8 \n" 290 "vswp.u8 d20, d22 \n" 291 "vmov.u8 d23, #255 \n" 292 MEMACCESS(3) 293 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" 294 "bgt 1b \n" 295 : "+r"(src_y), // %0 296 "+r"(src_u), // %1 297 "+r"(src_v), // %2 298 "+r"(dst_abgr), // %3 299 "+r"(width) // %4 300 : "r"(&kUVToRB), // %5 301 "r"(&kUVToG) // %6 302 : "cc", "memory", "q0", "q1", "q2", "q3", 303 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 304 ); 305} 306#endif // HAS_I422TOABGRROW_NEON 307 308#ifdef HAS_I422TORGBAROW_NEON 309void I422ToRGBARow_NEON(const uint8* src_y, 310 const uint8* src_u, 311 const uint8* src_v, 312 uint8* dst_rgba, 313 int width) { 314 asm volatile ( 315 MEMACCESS(5) 316 "vld1.8 {d24}, [%5] \n" 317 MEMACCESS(6) 318 "vld1.8 {d25}, [%6] \n" 319 "vmov.u8 d26, #128 \n" 320 "vmov.u16 q14, #74 \n" 321 "vmov.u16 q15, #16 \n" 322 ".p2align 2 \n" 323 "1: \n" 324 READYUV422 325 YUV422TORGB 326 "subs %4, %4, #8 \n" 327 "vmov.u8 d19, #255 \n" 328 MEMACCESS(3) 329 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" 330 "bgt 1b \n" 331 : "+r"(src_y), // %0 332 "+r"(src_u), // %1 333 "+r"(src_v), // %2 334 "+r"(dst_rgba), // %3 335 "+r"(width) // %4 336 : "r"(&kUVToRB), // %5 337 "r"(&kUVToG) // %6 338 : "cc", "memory", "q0", "q1", "q2", "q3", 339 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 340 ); 341} 342#endif // HAS_I422TORGBAROW_NEON 343 344#ifdef HAS_I422TORGB24ROW_NEON 345void I422ToRGB24Row_NEON(const uint8* src_y, 346 const uint8* src_u, 347 const uint8* src_v, 348 uint8* dst_rgb24, 349 int width) { 350 asm volatile ( 351 MEMACCESS(5) 352 "vld1.8 {d24}, [%5] \n" 353 MEMACCESS(6) 354 "vld1.8 {d25}, [%6] \n" 355 "vmov.u8 d26, #128 \n" 356 "vmov.u16 q14, #74 \n" 357 "vmov.u16 q15, #16 \n" 358 ".p2align 2 \n" 359 "1: \n" 360 READYUV422 361 YUV422TORGB 362 "subs %4, %4, #8 \n" 363 MEMACCESS(3) 364 "vst3.8 {d20, d21, d22}, [%3]! \n" 365 "bgt 1b \n" 366 : "+r"(src_y), // %0 367 "+r"(src_u), // %1 368 "+r"(src_v), // %2 369 "+r"(dst_rgb24), // %3 370 "+r"(width) // %4 371 : "r"(&kUVToRB), // %5 372 "r"(&kUVToG) // %6 373 : "cc", "memory", "q0", "q1", "q2", "q3", 374 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 375 ); 376} 377#endif // HAS_I422TORGB24ROW_NEON 378 379#ifdef HAS_I422TORAWROW_NEON 380void I422ToRAWRow_NEON(const uint8* src_y, 381 const uint8* src_u, 382 const uint8* src_v, 383 uint8* dst_raw, 384 int width) { 385 asm volatile ( 386 MEMACCESS(5) 387 "vld1.8 {d24}, [%5] \n" 388 MEMACCESS(6) 389 "vld1.8 {d25}, [%6] \n" 390 "vmov.u8 d26, #128 \n" 391 "vmov.u16 q14, #74 \n" 392 "vmov.u16 q15, #16 \n" 393 ".p2align 2 \n" 394 "1: \n" 395 READYUV422 396 YUV422TORGB 397 "subs %4, %4, #8 \n" 398 "vswp.u8 d20, d22 \n" 399 MEMACCESS(3) 400 "vst3.8 {d20, d21, d22}, [%3]! \n" 401 "bgt 1b \n" 402 : "+r"(src_y), // %0 403 "+r"(src_u), // %1 404 "+r"(src_v), // %2 405 "+r"(dst_raw), // %3 406 "+r"(width) // %4 407 : "r"(&kUVToRB), // %5 408 "r"(&kUVToG) // %6 409 : "cc", "memory", "q0", "q1", "q2", "q3", 410 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 411 ); 412} 413#endif // HAS_I422TORAWROW_NEON 414 415#define ARGBTORGB565 \ 416 "vshr.u8 d20, d20, #3 \n" /* B */ \ 417 "vshr.u8 d21, d21, #2 \n" /* G */ \ 418 "vshr.u8 d22, d22, #3 \n" /* R */ \ 419 "vmovl.u8 q8, d20 \n" /* B */ \ 420 "vmovl.u8 q9, d21 \n" /* G */ \ 421 "vmovl.u8 q10, d22 \n" /* R */ \ 422 "vshl.u16 q9, q9, #5 \n" /* G */ \ 423 "vshl.u16 q10, q10, #11 \n" /* R */ \ 424 "vorr q0, q8, q9 \n" /* BG */ \ 425 "vorr q0, q0, q10 \n" /* BGR */ 426 427#ifdef HAS_I422TORGB565ROW_NEON 428void I422ToRGB565Row_NEON(const uint8* src_y, 429 const uint8* src_u, 430 const uint8* src_v, 431 uint8* dst_rgb565, 432 int width) { 433 asm volatile ( 434 MEMACCESS(5) 435 "vld1.8 {d24}, [%5] \n" 436 MEMACCESS(6) 437 "vld1.8 {d25}, [%6] \n" 438 "vmov.u8 d26, #128 \n" 439 "vmov.u16 q14, #74 \n" 440 "vmov.u16 q15, #16 \n" 441 ".p2align 2 \n" 442 "1: \n" 443 READYUV422 444 YUV422TORGB 445 "subs %4, %4, #8 \n" 446 ARGBTORGB565 447 MEMACCESS(3) 448 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 449 "bgt 1b \n" 450 : "+r"(src_y), // %0 451 "+r"(src_u), // %1 452 "+r"(src_v), // %2 453 "+r"(dst_rgb565), // %3 454 "+r"(width) // %4 455 : "r"(&kUVToRB), // %5 456 "r"(&kUVToG) // %6 457 : "cc", "memory", "q0", "q1", "q2", "q3", 458 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 459 ); 460} 461#endif // HAS_I422TORGB565ROW_NEON 462 463#define ARGBTOARGB1555 \ 464 "vshr.u8 q10, q10, #3 \n" /* B */ \ 465 "vshr.u8 d22, d22, #3 \n" /* R */ \ 466 "vshr.u8 d23, d23, #7 \n" /* A */ \ 467 "vmovl.u8 q8, d20 \n" /* B */ \ 468 "vmovl.u8 q9, d21 \n" /* G */ \ 469 "vmovl.u8 q10, d22 \n" /* R */ \ 470 "vmovl.u8 q11, d23 \n" /* A */ \ 471 "vshl.u16 q9, q9, #5 \n" /* G */ \ 472 "vshl.u16 q10, q10, #10 \n" /* R */ \ 473 "vshl.u16 q11, q11, #15 \n" /* A */ \ 474 "vorr q0, q8, q9 \n" /* BG */ \ 475 "vorr q1, q10, q11 \n" /* RA */ \ 476 "vorr q0, q0, q1 \n" /* BGRA */ 477 478#ifdef HAS_I422TOARGB1555ROW_NEON 479void I422ToARGB1555Row_NEON(const uint8* src_y, 480 const uint8* src_u, 481 const uint8* src_v, 482 uint8* dst_argb1555, 483 int width) { 484 asm volatile ( 485 MEMACCESS(5) 486 "vld1.8 {d24}, [%5] \n" 487 MEMACCESS(6) 488 "vld1.8 {d25}, [%6] \n" 489 "vmov.u8 d26, #128 \n" 490 "vmov.u16 q14, #74 \n" 491 "vmov.u16 q15, #16 \n" 492 ".p2align 2 \n" 493 "1: \n" 494 READYUV422 495 YUV422TORGB 496 "subs %4, %4, #8 \n" 497 "vmov.u8 d23, #255 \n" 498 ARGBTOARGB1555 499 MEMACCESS(3) 500 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. 501 "bgt 1b \n" 502 : "+r"(src_y), // %0 503 "+r"(src_u), // %1 504 "+r"(src_v), // %2 505 "+r"(dst_argb1555), // %3 506 "+r"(width) // %4 507 : "r"(&kUVToRB), // %5 508 "r"(&kUVToG) // %6 509 : "cc", "memory", "q0", "q1", "q2", "q3", 510 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 511 ); 512} 513#endif // HAS_I422TOARGB1555ROW_NEON 514 515#define ARGBTOARGB4444 \ 516 "vshr.u8 d20, d20, #4 \n" /* B */ \ 517 "vbic.32 d21, d21, d4 \n" /* G */ \ 518 "vshr.u8 d22, d22, #4 \n" /* R */ \ 519 "vbic.32 d23, d23, d4 \n" /* A */ \ 520 "vorr d0, d20, d21 \n" /* BG */ \ 521 "vorr d1, d22, d23 \n" /* RA */ \ 522 "vzip.u8 d0, d1 \n" /* BGRA */ 523 524#ifdef HAS_I422TOARGB4444ROW_NEON 525void I422ToARGB4444Row_NEON(const uint8* src_y, 526 const uint8* src_u, 527 const uint8* src_v, 528 uint8* dst_argb4444, 529 int width) { 530 asm volatile ( 531 MEMACCESS(5) 532 "vld1.8 {d24}, [%5] \n" 533 MEMACCESS(6) 534 "vld1.8 {d25}, [%6] \n" 535 "vmov.u8 d26, #128 \n" 536 "vmov.u16 q14, #74 \n" 537 "vmov.u16 q15, #16 \n" 538 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. 539 ".p2align 2 \n" 540 "1: \n" 541 READYUV422 542 YUV422TORGB 543 "subs %4, %4, #8 \n" 544 "vmov.u8 d23, #255 \n" 545 ARGBTOARGB4444 546 MEMACCESS(3) 547 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. 548 "bgt 1b \n" 549 : "+r"(src_y), // %0 550 "+r"(src_u), // %1 551 "+r"(src_v), // %2 552 "+r"(dst_argb4444), // %3 553 "+r"(width) // %4 554 : "r"(&kUVToRB), // %5 555 "r"(&kUVToG) // %6 556 : "cc", "memory", "q0", "q1", "q2", "q3", 557 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 558 ); 559} 560#endif // HAS_I422TOARGB4444ROW_NEON 561 562#ifdef HAS_YTOARGBROW_NEON 563void YToARGBRow_NEON(const uint8* src_y, 564 uint8* dst_argb, 565 int width) { 566 asm volatile ( 567 MEMACCESS(3) 568 "vld1.8 {d24}, [%3] \n" 569 MEMACCESS(4) 570 "vld1.8 {d25}, [%4] \n" 571 "vmov.u8 d26, #128 \n" 572 "vmov.u16 q14, #74 \n" 573 "vmov.u16 q15, #16 \n" 574 ".p2align 2 \n" 575 "1: \n" 576 READYUV400 577 YUV422TORGB 578 "subs %2, %2, #8 \n" 579 "vmov.u8 d23, #255 \n" 580 MEMACCESS(1) 581 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 582 "bgt 1b \n" 583 : "+r"(src_y), // %0 584 "+r"(dst_argb), // %1 585 "+r"(width) // %2 586 : "r"(&kUVToRB), // %3 587 "r"(&kUVToG) // %4 588 : "cc", "memory", "q0", "q1", "q2", "q3", 589 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 590 ); 591} 592#endif // HAS_YTOARGBROW_NEON 593 594#ifdef HAS_I400TOARGBROW_NEON 595void I400ToARGBRow_NEON(const uint8* src_y, 596 uint8* dst_argb, 597 int width) { 598 asm volatile ( 599 ".p2align 2 \n" 600 "vmov.u8 d23, #255 \n" 601 "1: \n" 602 MEMACCESS(0) 603 "vld1.8 {d20}, [%0]! \n" 604 "vmov d21, d20 \n" 605 "vmov d22, d20 \n" 606 "subs %2, %2, #8 \n" 607 MEMACCESS(1) 608 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 609 "bgt 1b \n" 610 : "+r"(src_y), // %0 611 "+r"(dst_argb), // %1 612 "+r"(width) // %2 613 : 614 : "cc", "memory", "d20", "d21", "d22", "d23" 615 ); 616} 617#endif // HAS_I400TOARGBROW_NEON 618 619#ifdef HAS_NV12TOARGBROW_NEON 620void NV12ToARGBRow_NEON(const uint8* src_y, 621 const uint8* src_uv, 622 uint8* dst_argb, 623 int width) { 624 asm volatile ( 625 MEMACCESS(4) 626 "vld1.8 {d24}, [%4] \n" 627 MEMACCESS(5) 628 "vld1.8 {d25}, [%5] \n" 629 "vmov.u8 d26, #128 \n" 630 "vmov.u16 q14, #74 \n" 631 "vmov.u16 q15, #16 \n" 632 ".p2align 2 \n" 633 "1: \n" 634 READNV12 635 YUV422TORGB 636 "subs %3, %3, #8 \n" 637 "vmov.u8 d23, #255 \n" 638 MEMACCESS(2) 639 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 640 "bgt 1b \n" 641 : "+r"(src_y), // %0 642 "+r"(src_uv), // %1 643 "+r"(dst_argb), // %2 644 "+r"(width) // %3 645 : "r"(&kUVToRB), // %4 646 "r"(&kUVToG) // %5 647 : "cc", "memory", "q0", "q1", "q2", "q3", 648 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 649 ); 650} 651#endif // HAS_NV12TOARGBROW_NEON 652 653#ifdef HAS_NV21TOARGBROW_NEON 654void NV21ToARGBRow_NEON(const uint8* src_y, 655 const uint8* src_uv, 656 uint8* dst_argb, 657 int width) { 658 asm volatile ( 659 MEMACCESS(4) 660 "vld1.8 {d24}, [%4] \n" 661 MEMACCESS(5) 662 "vld1.8 {d25}, [%5] \n" 663 "vmov.u8 d26, #128 \n" 664 "vmov.u16 q14, #74 \n" 665 "vmov.u16 q15, #16 \n" 666 ".p2align 2 \n" 667 "1: \n" 668 READNV21 669 YUV422TORGB 670 "subs %3, %3, #8 \n" 671 "vmov.u8 d23, #255 \n" 672 MEMACCESS(2) 673 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" 674 "bgt 1b \n" 675 : "+r"(src_y), // %0 676 "+r"(src_uv), // %1 677 "+r"(dst_argb), // %2 678 "+r"(width) // %3 679 : "r"(&kUVToRB), // %4 680 "r"(&kUVToG) // %5 681 : "cc", "memory", "q0", "q1", "q2", "q3", 682 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 683 ); 684} 685#endif // HAS_NV21TOARGBROW_NEON 686 687#ifdef HAS_NV12TORGB565ROW_NEON 688void NV12ToRGB565Row_NEON(const uint8* src_y, 689 const uint8* src_uv, 690 uint8* dst_rgb565, 691 int width) { 692 asm volatile ( 693 MEMACCESS(4) 694 "vld1.8 {d24}, [%4] \n" 695 MEMACCESS(5) 696 "vld1.8 {d25}, [%5] \n" 697 "vmov.u8 d26, #128 \n" 698 "vmov.u16 q14, #74 \n" 699 "vmov.u16 q15, #16 \n" 700 ".p2align 2 \n" 701 "1: \n" 702 READNV12 703 YUV422TORGB 704 "subs %3, %3, #8 \n" 705 ARGBTORGB565 706 MEMACCESS(2) 707 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 708 "bgt 1b \n" 709 : "+r"(src_y), // %0 710 "+r"(src_uv), // %1 711 "+r"(dst_rgb565), // %2 712 "+r"(width) // %3 713 : "r"(&kUVToRB), // %4 714 "r"(&kUVToG) // %5 715 : "cc", "memory", "q0", "q1", "q2", "q3", 716 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 717 ); 718} 719#endif // HAS_NV12TORGB565ROW_NEON 720 721#ifdef HAS_NV21TORGB565ROW_NEON 722void NV21ToRGB565Row_NEON(const uint8* src_y, 723 const uint8* src_uv, 724 uint8* dst_rgb565, 725 int width) { 726 asm volatile ( 727 MEMACCESS(4) 728 "vld1.8 {d24}, [%4] \n" 729 MEMACCESS(5) 730 "vld1.8 {d25}, [%5] \n" 731 "vmov.u8 d26, #128 \n" 732 "vmov.u16 q14, #74 \n" 733 "vmov.u16 q15, #16 \n" 734 ".p2align 2 \n" 735 "1: \n" 736 READNV21 737 YUV422TORGB 738 "subs %3, %3, #8 \n" 739 ARGBTORGB565 740 MEMACCESS(2) 741 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 742 "bgt 1b \n" 743 : "+r"(src_y), // %0 744 "+r"(src_uv), // %1 745 "+r"(dst_rgb565), // %2 746 "+r"(width) // %3 747 : "r"(&kUVToRB), // %4 748 "r"(&kUVToG) // %5 749 : "cc", "memory", "q0", "q1", "q2", "q3", 750 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 751 ); 752} 753#endif // HAS_NV21TORGB565ROW_NEON 754 755#ifdef HAS_YUY2TOARGBROW_NEON 756void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 757 uint8* dst_argb, 758 int width) { 759 asm volatile ( 760 MEMACCESS(3) 761 "vld1.8 {d24}, [%3] \n" 762 MEMACCESS(4) 763 "vld1.8 {d25}, [%4] \n" 764 "vmov.u8 d26, #128 \n" 765 "vmov.u16 q14, #74 \n" 766 "vmov.u16 q15, #16 \n" 767 ".p2align 2 \n" 768 "1: \n" 769 READYUY2 770 YUV422TORGB 771 "subs %2, %2, #8 \n" 772 "vmov.u8 d23, #255 \n" 773 MEMACCESS(1) 774 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 775 "bgt 1b \n" 776 : "+r"(src_yuy2), // %0 777 "+r"(dst_argb), // %1 778 "+r"(width) // %2 779 : "r"(&kUVToRB), // %3 780 "r"(&kUVToG) // %4 781 : "cc", "memory", "q0", "q1", "q2", "q3", 782 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 783 ); 784} 785#endif // HAS_YUY2TOARGBROW_NEON 786 787#ifdef HAS_UYVYTOARGBROW_NEON 788void UYVYToARGBRow_NEON(const uint8* src_uyvy, 789 uint8* dst_argb, 790 int width) { 791 asm volatile ( 792 MEMACCESS(3) 793 "vld1.8 {d24}, [%3] \n" 794 MEMACCESS(4) 795 "vld1.8 {d25}, [%4] \n" 796 "vmov.u8 d26, #128 \n" 797 "vmov.u16 q14, #74 \n" 798 "vmov.u16 q15, #16 \n" 799 ".p2align 2 \n" 800 "1: \n" 801 READUYVY 802 YUV422TORGB 803 "subs %2, %2, #8 \n" 804 "vmov.u8 d23, #255 \n" 805 MEMACCESS(1) 806 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" 807 "bgt 1b \n" 808 : "+r"(src_uyvy), // %0 809 "+r"(dst_argb), // %1 810 "+r"(width) // %2 811 : "r"(&kUVToRB), // %3 812 "r"(&kUVToG) // %4 813 : "cc", "memory", "q0", "q1", "q2", "q3", 814 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 815 ); 816} 817#endif // HAS_UYVYTOARGBROW_NEON 818 819// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 820#ifdef HAS_SPLITUVROW_NEON 821void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 822 int width) { 823 asm volatile ( 824 ".p2align 2 \n" 825 "1: \n" 826 MEMACCESS(0) 827 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV 828 "subs %3, %3, #16 \n" // 16 processed per loop 829 MEMACCESS(1) 830 "st1 {v0.16b}, [%1], #16 \n" // store U 831 MEMACCESS(2) 832 "st1 {v1.16b}, [%2], #16 \n" // store V 833 "bgt 1b \n" 834 : "+r"(src_uv), // %0 835 "+r"(dst_u), // %1 836 "+r"(dst_v), // %2 837 "+r"(width) // %3 // Output registers 838 : // Input registers 839 : "cc", "memory", "v0", "v1" // Clobber List 840 ); 841} 842#endif // HAS_SPLITUVROW_NEON 843 844// Reads 16 U's and V's and writes out 16 pairs of UV. 845#ifdef HAS_MERGEUVROW_NEON 846void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 847 int width) { 848 asm volatile ( 849 ".p2align 2 \n" 850 "1: \n" 851 MEMACCESS(0) 852 "ld1 {v0.16b}, [%0], #16 \n" // load U 853 MEMACCESS(1) 854 "ld1 {v1.16b}, [%1], #16 \n" // load V 855 "subs %3, %3, #16 \n" // 16 processed per loop 856 MEMACCESS(2) 857 "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV 858 "bgt 1b \n" 859 : 860 "+r"(src_u), // %0 861 "+r"(src_v), // %1 862 "+r"(dst_uv), // %2 863 "+r"(width) // %3 // Output registers 864 : // Input registers 865 : "cc", "memory", "v0", "v1" // Clobber List 866 ); 867} 868#endif // HAS_MERGEUVROW_NEON 869 870// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 871#ifdef HAS_COPYROW_NEON 872void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 873 asm volatile ( 874 ".p2align 2 \n" 875 "1: \n" 876 MEMACCESS(0) 877 "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 878 "subs %2, %2, #32 \n" // 32 processed per loop 879 MEMACCESS(1) 880 "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 881 "bgt 1b \n" 882 : "+r"(src), // %0 883 "+r"(dst), // %1 884 "+r"(count) // %2 // Output registers 885 : // Input registers 886 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 887 ); 888} 889#endif // HAS_COPYROW_NEON 890 891// SetRow8 writes 'count' bytes using a 32 bit value repeated. 892#ifdef HAS_SETROW_NEON 893void SetRow_NEON(uint8* dst, uint32 v32, int count) { 894 asm volatile ( 895 "dup v0.4s, %w2 \n" // duplicate 4 ints 896 "1: \n" 897 "subs %1, %1, #16 \n" // 16 bytes per loop 898 MEMACCESS(0) 899 "st1 {v0.16b}, [%0], #16 \n" // store 900 "bgt 1b \n" 901 : "+r"(dst), // %0 902 "+r"(count) // %1 903 : "r"(v32) // %2 904 : "cc", "memory", "v0" 905 ); 906} 907#endif // HAS_SETROW_NEON 908 909// TODO(fbarchard): Make fully assembler 910// SetRow32 writes 'count' words using a 32 bit value repeated. 911#ifdef HAS_ARGBSETROWS_NEON 912void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, 913 int dst_stride, int height) { 914 for (int y = 0; y < height; ++y) { 915 SetRow_NEON(dst, v32, width << 2); 916 dst += dst_stride; 917 } 918} 919#endif // HAS_ARGBSETROWS_NEON 920 921#ifdef HAS_MIRRORROW_NEON 922void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 923 asm volatile ( 924 // Start at end of source row. 925 "add %0, %0, %2 \n" 926 "sub %0, %0, #16 \n" 927 928 ".p2align 2 \n" 929 "1: \n" 930 MEMACCESS(0) 931 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 932 "subs %2, %2, #16 \n" // 16 pixels per loop. 933 "rev64 v0.16b, v0.16b \n" 934 MEMACCESS(1) 935 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 936 MEMACCESS(1) 937 "st1 {v0.D}[0], [%1], #8 \n" 938 "bgt 1b \n" 939 : "+r"(src), // %0 940 "+r"(dst), // %1 941 "+r"(width) // %2 942 : "r"((ptrdiff_t)-16) // %3 943 : "cc", "memory", "v0" 944 ); 945} 946#endif // HAS_MIRRORROW_NEON 947 948#ifdef HAS_MIRRORUVROW_NEON 949void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 950 int width) { 951 asm volatile ( 952 // Start at end of source row. 953 "add %0, %0, %3, lsl #1 \n" 954 "sub %0, %0, #16 \n" 955 956 ".p2align 2 \n" 957 "1: \n" 958 MEMACCESS(0) 959 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 960 "subs %3, %3, #8 \n" // 8 pixels per loop. 961 "rev64 v0.8b, v0.8b \n" 962 "rev64 v1.8b, v1.8b \n" 963 MEMACCESS(1) 964 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 965 MEMACCESS(2) 966 "st1 {v1.8b}, [%2], #8 \n" 967 "bgt 1b \n" 968 : "+r"(src_uv), // %0 969 "+r"(dst_u), // %1 970 "+r"(dst_v), // %2 971 "+r"(width) // %3 972 : "r"((ptrdiff_t)-16) // %4 973 : "cc", "memory", "v0", "v1" 974 ); 975} 976#endif // HAS_MIRRORUVROW_NEON 977 978#ifdef HAS_ARGBMIRRORROW_NEON 979void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 980 asm volatile ( 981 // Start at end of source row. 982 "add %0, %0, %2, lsl #2 \n" 983 "sub %0, %0, #16 \n" 984 985 ".p2align 2 \n" 986 "1: \n" 987 MEMACCESS(0) 988 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 989 "subs %2, %2, #4 \n" // 4 pixels per loop. 990 "rev64 v0.4s, v0.4s \n" 991 MEMACCESS(1) 992 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 993 MEMACCESS(1) 994 "st1 {v0.D}[0], [%1], #8 \n" 995 "bgt 1b \n" 996 : "+r"(src), // %0 997 "+r"(dst), // %1 998 "+r"(width) // %2 999 : "r"((ptrdiff_t)-16) // %3 1000 : "cc", "memory", "v0" 1001 ); 1002} 1003#endif // HAS_ARGBMIRRORROW_NEON 1004 1005#ifdef HAS_RGB24TOARGBROW_NEON 1006void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 1007 asm volatile ( 1008 "movi v4.8b, #255 \n" // Alpha 1009 ".p2align 2 \n" 1010 "1: \n" 1011 MEMACCESS(0) 1012 "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 1013 "subs %2, %2, #8 \n" // 8 processed per loop. 1014 MEMACCESS(1) 1015 "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. 1016 "bgt 1b \n" 1017 : "+r"(src_rgb24), // %0 1018 "+r"(dst_argb), // %1 1019 "+r"(pix) // %2 1020 : 1021 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 1022 ); 1023} 1024#endif // HAS_RGB24TOARGBROW_NEON 1025 1026#ifdef HAS_RAWTOARGBROW_NEON 1027void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 1028 asm volatile ( 1029 "movi v5.8b, #255 \n" // Alpha 1030 ".p2align 2 \n" 1031 "1: \n" 1032 MEMACCESS(0) 1033 "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b 1034 "subs %2, %2, #8 \n" // 8 processed per loop. 1035 "mov v3.8b, v1.8b \n" // move g 1036 "mov v4.8b, v0.8b \n" // move r 1037 MEMACCESS(1) 1038 "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a 1039 "bgt 1b \n" 1040 : "+r"(src_raw), // %0 1041 "+r"(dst_argb), // %1 1042 "+r"(pix) // %2 1043 : 1044 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 1045 ); 1046} 1047#endif // HAS_RAWTOARGBROW_NEON 1048 1049#define RGB565TOARGB \ 1050 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ 1051 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ 1052 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ 1053 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ 1054 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ 1055 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ 1056 "vorr.u8 d0, d0, d4 \n" /* B */ \ 1057 "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ 1058 "vorr.u8 d2, d1, d5 \n" /* R */ \ 1059 "vorr.u8 d1, d4, d6 \n" /* G */ 1060 1061#ifdef HAS_RGB565TOARGBROW_NEON 1062void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { 1063 asm volatile ( 1064 "vmov.u8 d3, #255 \n" // Alpha 1065 ".p2align 2 \n" 1066 "1: \n" 1067 MEMACCESS(0) 1068 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 1069 "subs %2, %2, #8 \n" // 8 processed per loop. 1070 RGB565TOARGB 1071 MEMACCESS(1) 1072 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 1073 "bgt 1b \n" 1074 : "+r"(src_rgb565), // %0 1075 "+r"(dst_argb), // %1 1076 "+r"(pix) // %2 1077 : 1078 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List 1079 ); 1080} 1081#endif // HAS_RGB565TOARGBROW_NEON 1082 1083#define ARGB1555TOARGB \ 1084 "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ 1085 "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ 1086 "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ 1087 "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ 1088 "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ 1089 "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ 1090 "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ 1091 "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ 1092 "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ 1093 "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ 1094 "vorr.u8 q1, q1, q3 \n" /* R,A */ \ 1095 "vorr.u8 q0, q0, q2 \n" /* B,G */ \ 1096 1097// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 1098#define RGB555TOARGB \ 1099 "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ 1100 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ 1101 "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ 1102 "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ 1103 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ 1104 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ 1105 "vorr.u8 d0, d0, d4 \n" /* B */ \ 1106 "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ 1107 "vorr.u8 d2, d1, d5 \n" /* R */ \ 1108 "vorr.u8 d1, d4, d6 \n" /* G */ 1109 1110#ifdef HAS_ARGB1555TOARGBROW_NEON 1111void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 1112 int pix) { 1113 asm volatile ( 1114 "vmov.u8 d3, #255 \n" // Alpha 1115 ".p2align 2 \n" 1116 "1: \n" 1117 MEMACCESS(0) 1118 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 1119 "subs %2, %2, #8 \n" // 8 processed per loop. 1120 ARGB1555TOARGB 1121 MEMACCESS(1) 1122 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 1123 "bgt 1b \n" 1124 : "+r"(src_argb1555), // %0 1125 "+r"(dst_argb), // %1 1126 "+r"(pix) // %2 1127 : 1128 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List 1129 ); 1130} 1131#endif // HAS_ARGB1555TOARGBROW_NEON 1132 1133#define ARGB4444TOARGB \ 1134 "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ 1135 "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ 1136 "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ 1137 "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ 1138 "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ 1139 "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ 1140 "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ 1141 "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ 1142 1143#ifdef HAS_ARGB4444TOARGBROW_NEON 1144void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 1145 int pix) { 1146 asm volatile ( 1147 "vmov.u8 d3, #255 \n" // Alpha 1148 ".p2align 2 \n" 1149 "1: \n" 1150 MEMACCESS(0) 1151 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 1152 "subs %2, %2, #8 \n" // 8 processed per loop. 1153 ARGB4444TOARGB 1154 MEMACCESS(1) 1155 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 1156 "bgt 1b \n" 1157 : "+r"(src_argb4444), // %0 1158 "+r"(dst_argb), // %1 1159 "+r"(pix) // %2 1160 : 1161 : "cc", "memory", "q0", "q1", "q2" // Clobber List 1162 ); 1163} 1164#endif // HAS_ARGB4444TOARGBROW_NEON 1165 1166#ifdef HAS_ARGBTORGB24ROW_NEON 1167void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { 1168 asm volatile ( 1169 ".p2align 2 \n" 1170 "1: \n" 1171 MEMACCESS(0) 1172 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. 1173 "subs %2, %2, #8 \n" // 8 processed per loop. 1174 MEMACCESS(1) 1175 "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 1176 "bgt 1b \n" 1177 : "+r"(src_argb), // %0 1178 "+r"(dst_rgb24), // %1 1179 "+r"(pix) // %2 1180 : 1181 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 1182 ); 1183} 1184#endif // HAS_ARGBTORGB24ROW_NEON 1185 1186#ifdef HAS_ARGBTORAWROW_NEON 1187void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { 1188 asm volatile ( 1189 ".p2align 2 \n" 1190 "1: \n" 1191 MEMACCESS(0) 1192 "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a 1193 "subs %2, %2, #8 \n" // 8 processed per loop. 1194 "mov v4.8b, v2.8b \n" // mov g 1195 "mov v5.8b, v1.8b \n" // mov b 1196 MEMACCESS(1) 1197 "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b 1198 "bgt 1b \n" 1199 : "+r"(src_argb), // %0 1200 "+r"(dst_raw), // %1 1201 "+r"(pix) // %2 1202 : 1203 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 1204 ); 1205} 1206#endif // HAS_ARGBTORAWROW_NEON 1207 1208#ifdef HAS_YUY2TOYROW_NEON 1209void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { 1210 asm volatile ( 1211 ".p2align 2 \n" 1212 "1: \n" 1213 MEMACCESS(0) 1214 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 1215 "subs %2, %2, #16 \n" // 16 processed per loop. 1216 MEMACCESS(1) 1217 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1218 "bgt 1b \n" 1219 : "+r"(src_yuy2), // %0 1220 "+r"(dst_y), // %1 1221 "+r"(pix) // %2 1222 : 1223 : "cc", "memory", "v0", "v1" // Clobber List 1224 ); 1225} 1226#endif // HAS_YUY2TOYROW_NEON 1227 1228#ifdef HAS_UYVYTOYROW_NEON 1229void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { 1230 asm volatile ( 1231 ".p2align 2 \n" 1232 "1: \n" 1233 MEMACCESS(0) 1234 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1235 "subs %2, %2, #16 \n" // 16 processed per loop. 1236 MEMACCESS(1) 1237 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 1238 "bgt 1b \n" 1239 : "+r"(src_uyvy), // %0 1240 "+r"(dst_y), // %1 1241 "+r"(pix) // %2 1242 : 1243 : "cc", "memory", "v0", "v1" // Clobber List 1244 ); 1245} 1246#endif // HAS_UYVYTOYROW_NEON 1247 1248#ifdef HAS_YUY2TOUV422ROW_NEON 1249void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1250 int pix) { 1251 asm volatile ( 1252 ".p2align 2 \n" 1253 "1: \n" 1254 MEMACCESS(0) 1255 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. 1256 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1257 MEMACCESS(1) 1258 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1259 MEMACCESS(2) 1260 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1261 "bgt 1b \n" 1262 : "+r"(src_yuy2), // %0 1263 "+r"(dst_u), // %1 1264 "+r"(dst_v), // %2 1265 "+r"(pix) // %3 1266 : 1267 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1268 ); 1269} 1270#endif // HAS_YUY2TOUV422ROW_NEON 1271 1272#ifdef HAS_UYVYTOUV422ROW_NEON 1273void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1274 int pix) { 1275 asm volatile ( 1276 ".p2align 2 \n" 1277 "1: \n" 1278 MEMACCESS(0) 1279 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. 1280 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. 1281 MEMACCESS(1) 1282 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1283 MEMACCESS(2) 1284 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1285 "bgt 1b \n" 1286 : "+r"(src_uyvy), // %0 1287 "+r"(dst_u), // %1 1288 "+r"(dst_v), // %2 1289 "+r"(pix) // %3 1290 : 1291 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1292 ); 1293} 1294#endif // HAS_UYVYTOUV422ROW_NEON 1295 1296#ifdef HAS_YUY2TOUVROW_NEON 1297void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1298 uint8* dst_u, uint8* dst_v, int pix) { 1299 asm volatile ( 1300 "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2 1301 ".p2align 2 \n" 1302 "1: \n" 1303 MEMACCESS(0) 1304 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. 1305 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1306 MEMACCESS(1) 1307 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. 1308 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1309 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1310 MEMACCESS(2) 1311 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 1312 MEMACCESS(3) 1313 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1314 "bgt 1b \n" 1315 : "+r"(src_yuy2), // %0 1316 "+r"(stride_yuy2), // %1 1317 "+r"(dst_u), // %2 1318 "+r"(dst_v), // %3 1319 "+r"(pix) // %4 1320 : 1321 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List 1322 ); 1323} 1324#endif // HAS_YUY2TOUVROW_NEON 1325 1326#ifdef HAS_UYVYTOUVROW_NEON 1327void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1328 uint8* dst_u, uint8* dst_v, int pix) { 1329 asm volatile ( 1330 "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy 1331 ".p2align 2 \n" 1332 "1: \n" 1333 MEMACCESS(0) 1334 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. 1335 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. 1336 MEMACCESS(1) 1337 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. 1338 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1339 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1340 MEMACCESS(2) 1341 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1342 MEMACCESS(3) 1343 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1344 "bgt 1b \n" 1345 : "+r"(src_uyvy), // %0 1346 "+r"(stride_uyvy), // %1 1347 "+r"(dst_u), // %2 1348 "+r"(dst_v), // %3 1349 "+r"(pix) // %4 1350 : 1351 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List 1352 ); 1353} 1354#endif // HAS_UYVYTOUVROW_NEON 1355 1356#ifdef HAS_HALFROW_NEON 1357void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, 1358 uint8* dst_uv, int pix) { 1359 asm volatile ( 1360 // change the stride to row 2 pointer 1361 "add %x1, %x0, %w1, sxtw \n" 1362 "1: \n" 1363 MEMACCESS(0) 1364 "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels. 1365 "subs %3, %3, #16 \n" // 16 processed per loop 1366 MEMACCESS(1) 1367 "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels. 1368 "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2 1369 MEMACCESS(2) 1370 "st1 {v0.16b}, [%2], #16 \n" 1371 "bgt 1b \n" 1372 : "+r"(src_uv), // %0 1373 "+r"(src_uv_stride), // %1 1374 "+r"(dst_uv), // %2 1375 "+r"(pix) // %3 1376 : 1377 : "cc", "memory", "v0", "v1" // Clobber List 1378 ); 1379} 1380#endif // HAS_HALFROW_NEON 1381 1382// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG 1383#ifdef HAS_ARGBTOBAYERROW_NEON 1384void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, 1385 uint32 selector, int pix) { 1386 asm volatile ( 1387 "mov v2.s[0], %w3 \n" // selector 1388 "1: \n" 1389 MEMACCESS(0) 1390 "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. 1391 "subs %2, %2, #8 \n" // 8 processed per loop 1392 "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels 1393 "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels 1394 "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels 1395 MEMACCESS(1) 1396 "st1 {v4.8b}, [%1], #8 \n" // store 8. 1397 "bgt 1b \n" 1398 : "+r"(src_argb), // %0 1399 "+r"(dst_bayer), // %1 1400 "+r"(pix) // %2 1401 : "r"(selector) // %3 1402 : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List 1403 ); 1404} 1405#endif // HAS_ARGBTOBAYERROW_NEON 1406 1407// Select G channels from ARGB. e.g. GGGGGGGG 1408#ifdef HAS_ARGBTOBAYERGGROW_NEON 1409void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, 1410 uint32 /*selector*/, int pix) { 1411 asm volatile ( 1412 "1: \n" 1413 MEMACCESS(0) 1414 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. 1415 "subs %2, %2, #8 \n" // 8 processed per loop 1416 MEMACCESS(1) 1417 "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. 1418 "bgt 1b \n" 1419 : "+r"(src_argb), // %0 1420 "+r"(dst_bayer), // %1 1421 "+r"(pix) // %2 1422 : 1423 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1424 ); 1425} 1426#endif // HAS_ARGBTOBAYERGGROW_NEON 1427 1428// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1429#ifdef HAS_ARGBSHUFFLEROW_NEON 1430void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1431 const uint8* shuffler, int pix) { 1432 asm volatile ( 1433 MEMACCESS(3) 1434 "ld1 {v2.16b}, [%3] \n" // shuffler 1435 "1: \n" 1436 MEMACCESS(0) 1437 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1438 "subs %2, %2, #4 \n" // 4 processed per loop 1439 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1440 MEMACCESS(1) 1441 "st1 {v1.16b}, [%1], #16 \n" // store 4. 1442 "bgt 1b \n" 1443 : "+r"(src_argb), // %0 1444 "+r"(dst_argb), // %1 1445 "+r"(pix) // %2 1446 : "r"(shuffler) // %3 1447 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1448 ); 1449} 1450#endif // HAS_ARGBSHUFFLEROW_NEON 1451 1452#ifdef HAS_I422TOYUY2ROW_NEON 1453void I422ToYUY2Row_NEON(const uint8* src_y, 1454 const uint8* src_u, 1455 const uint8* src_v, 1456 uint8* dst_yuy2, int width) { 1457 asm volatile ( 1458 ".p2align 2 \n" 1459 "1: \n" 1460 MEMACCESS(0) 1461 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1462 "mov v2.8b, v1.8b \n" 1463 MEMACCESS(1) 1464 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1465 MEMACCESS(2) 1466 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1467 "subs %4, %4, #16 \n" // 16 pixels 1468 MEMACCESS(3) 1469 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. 1470 "bgt 1b \n" 1471 : "+r"(src_y), // %0 1472 "+r"(src_u), // %1 1473 "+r"(src_v), // %2 1474 "+r"(dst_yuy2), // %3 1475 "+r"(width) // %4 1476 : 1477 : "cc", "memory", "v0", "v1", "v2", "v3" 1478 ); 1479} 1480#endif // HAS_I422TOYUY2ROW_NEON 1481 1482#ifdef HAS_I422TOUYVYROW_NEON 1483void I422ToUYVYRow_NEON(const uint8* src_y, 1484 const uint8* src_u, 1485 const uint8* src_v, 1486 uint8* dst_uyvy, int width) { 1487 asm volatile ( 1488 ".p2align 2 \n" 1489 "1: \n" 1490 MEMACCESS(0) 1491 "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys 1492 "mov v3.8b, v2.8b \n" 1493 MEMACCESS(1) 1494 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1495 MEMACCESS(2) 1496 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1497 "subs %4, %4, #16 \n" // 16 pixels 1498 MEMACCESS(3) 1499 "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. 1500 "bgt 1b \n" 1501 : "+r"(src_y), // %0 1502 "+r"(src_u), // %1 1503 "+r"(src_v), // %2 1504 "+r"(dst_uyvy), // %3 1505 "+r"(width) // %4 1506 : 1507 : "cc", "memory", "v0", "v1", "v2", "v3" 1508 ); 1509} 1510#endif // HAS_I422TOUYVYROW_NEON 1511 1512#ifdef HAS_ARGBTORGB565ROW_NEON 1513void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { 1514 asm volatile ( 1515 ".p2align 2 \n" 1516 "1: \n" 1517 MEMACCESS(0) 1518 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1519 "subs %2, %2, #8 \n" // 8 processed per loop. 1520 ARGBTORGB565 1521 MEMACCESS(1) 1522 "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. 1523 "bgt 1b \n" 1524 : "+r"(src_argb), // %0 1525 "+r"(dst_rgb565), // %1 1526 "+r"(pix) // %2 1527 : 1528 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1529 ); 1530} 1531#endif // HAS_ARGBTORGB565ROW_NEON 1532 1533#ifdef HAS_ARGBTOARGB1555ROW_NEON 1534void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1535 int pix) { 1536 asm volatile ( 1537 ".p2align 2 \n" 1538 "1: \n" 1539 MEMACCESS(0) 1540 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1541 "subs %2, %2, #8 \n" // 8 processed per loop. 1542 ARGBTOARGB1555 1543 MEMACCESS(1) 1544 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. 1545 "bgt 1b \n" 1546 : "+r"(src_argb), // %0 1547 "+r"(dst_argb1555), // %1 1548 "+r"(pix) // %2 1549 : 1550 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1551 ); 1552} 1553#endif // HAS_ARGBTOARGB1555ROW_NEON 1554 1555#ifdef HAS_ARGBTOARGB4444ROW_NEON 1556void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1557 int pix) { 1558 asm volatile ( 1559 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. 1560 ".p2align 2 \n" 1561 "1: \n" 1562 MEMACCESS(0) 1563 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. 1564 "subs %2, %2, #8 \n" // 8 processed per loop. 1565 ARGBTOARGB4444 1566 MEMACCESS(1) 1567 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. 1568 "bgt 1b \n" 1569 : "+r"(src_argb), // %0 1570 "+r"(dst_argb4444), // %1 1571 "+r"(pix) // %2 1572 : 1573 : "cc", "memory", "q0", "q8", "q9", "q10", "q11" 1574 ); 1575} 1576#endif // HAS_ARGBTOARGB4444ROW_NEON 1577 1578#ifdef HAS_ARGBTOYROW_NEON 1579void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1580 asm volatile ( 1581 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1582 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1583 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1584 "movi v7.8b, #16 \n" // Add 16 constant 1585 ".p2align 2 \n" 1586 "1: \n" 1587 MEMACCESS(0) 1588 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1589 "subs %2, %2, #8 \n" // 8 processed per loop. 1590 "umull v3.8h, v0.8b, v4.8b \n" // B 1591 "umlal v3.8h, v1.8b, v5.8b \n" // G 1592 "umlal v3.8h, v2.8b, v6.8b \n" // R 1593 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1594 "uqadd v0.8b, v0.8b, v7.8b \n" 1595 MEMACCESS(1) 1596 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1597 "bgt 1b \n" 1598 : "+r"(src_argb), // %0 1599 "+r"(dst_y), // %1 1600 "+r"(pix) // %2 1601 : 1602 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1603 ); 1604} 1605#endif // HAS_ARGBTOYROW_NEON 1606 1607#ifdef HAS_ARGBTOYJROW_NEON 1608void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { 1609 asm volatile ( 1610 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1611 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1612 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1613 ".p2align 2 \n" 1614 "1: \n" 1615 MEMACCESS(0) 1616 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1617 "subs %2, %2, #8 \n" // 8 processed per loop. 1618 "umull v3.8h, v0.8b, v4.8b \n" // B 1619 "umlal v3.8h, v1.8b, v5.8b \n" // G 1620 "umlal v3.8h, v2.8b, v6.8b \n" // R 1621 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1622 MEMACCESS(1) 1623 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1624 "bgt 1b \n" 1625 : "+r"(src_argb), // %0 1626 "+r"(dst_y), // %1 1627 "+r"(pix) // %2 1628 : 1629 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1630 ); 1631} 1632#endif // HAS_ARGBTOYJROW_NEON 1633 1634// 8x1 pixels. 1635#ifdef HAS_ARGBTOUV444ROW_NEON 1636void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1637 int pix) { 1638 asm volatile ( 1639 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient 1640 "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient 1641 "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient 1642 "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient 1643 "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient 1644 "vmov.u16 q15, #0x8080 \n" // 128.5 1645 ".p2align 2 \n" 1646 "1: \n" 1647 MEMACCESS(0) 1648 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 1649 "subs %3, %3, #8 \n" // 8 processed per loop. 1650 "vmull.u8 q2, d0, d24 \n" // B 1651 "vmlsl.u8 q2, d1, d25 \n" // G 1652 "vmlsl.u8 q2, d2, d26 \n" // R 1653 "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned 1654 1655 "vmull.u8 q3, d2, d24 \n" // R 1656 "vmlsl.u8 q3, d1, d28 \n" // G 1657 "vmlsl.u8 q3, d0, d27 \n" // B 1658 "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned 1659 1660 "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U 1661 "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V 1662 1663 MEMACCESS(1) 1664 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1665 MEMACCESS(2) 1666 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1667 "bgt 1b \n" 1668 : "+r"(src_argb), // %0 1669 "+r"(dst_u), // %1 1670 "+r"(dst_v), // %2 1671 "+r"(pix) // %3 1672 : 1673 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" 1674 ); 1675} 1676#endif // HAS_ARGBTOUV444ROW_NEON 1677 1678// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1679#ifdef HAS_ARGBTOUV422ROW_NEON 1680void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1681 int pix) { 1682 asm volatile ( 1683 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1684 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1685 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1686 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1687 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1688 "vmov.u16 q15, #0x8080 \n" // 128.5 1689 ".p2align 2 \n" 1690 "1: \n" 1691 MEMACCESS(0) 1692 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1693 MEMACCESS(0) 1694 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1695 1696 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1697 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1698 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1699 1700 "subs %3, %3, #16 \n" // 16 processed per loop. 1701 "vmul.s16 q8, q0, q10 \n" // B 1702 "vmls.s16 q8, q1, q11 \n" // G 1703 "vmls.s16 q8, q2, q12 \n" // R 1704 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 1705 1706 "vmul.s16 q9, q2, q10 \n" // R 1707 "vmls.s16 q9, q1, q14 \n" // G 1708 "vmls.s16 q9, q0, q13 \n" // B 1709 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 1710 1711 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 1712 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 1713 1714 MEMACCESS(1) 1715 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1716 MEMACCESS(2) 1717 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1718 "bgt 1b \n" 1719 : "+r"(src_argb), // %0 1720 "+r"(dst_u), // %1 1721 "+r"(dst_v), // %2 1722 "+r"(pix) // %3 1723 : 1724 : "cc", "memory", "q0", "q1", "q2", "q3", 1725 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1726 ); 1727} 1728#endif // HAS_ARGBTOUV422ROW_NEON 1729 1730// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. 1731#ifdef HAS_ARGBTOUV411ROW_NEON 1732void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1733 int pix) { 1734 asm volatile ( 1735 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1736 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1737 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1738 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1739 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1740 "vmov.u16 q15, #0x8080 \n" // 128.5 1741 ".p2align 2 \n" 1742 "1: \n" 1743 MEMACCESS(0) 1744 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1745 MEMACCESS(0) 1746 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1747 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1748 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1749 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1750 MEMACCESS(0) 1751 "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. 1752 MEMACCESS(0) 1753 "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. 1754 "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. 1755 "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. 1756 "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. 1757 1758 "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. 1759 "vpadd.u16 d1, d8, d9 \n" // B 1760 "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. 1761 "vpadd.u16 d3, d10, d11 \n" // G 1762 "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. 1763 "vpadd.u16 d5, d12, d13 \n" // R 1764 1765 "vrshr.u16 q0, q0, #1 \n" // 2x average 1766 "vrshr.u16 q1, q1, #1 \n" 1767 "vrshr.u16 q2, q2, #1 \n" 1768 1769 "subs %3, %3, #32 \n" // 32 processed per loop. 1770 "vmul.s16 q8, q0, q10 \n" // B 1771 "vmls.s16 q8, q1, q11 \n" // G 1772 "vmls.s16 q8, q2, q12 \n" // R 1773 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 1774 "vmul.s16 q9, q2, q10 \n" // R 1775 "vmls.s16 q9, q1, q14 \n" // G 1776 "vmls.s16 q9, q0, q13 \n" // B 1777 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 1778 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 1779 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 1780 MEMACCESS(1) 1781 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 1782 MEMACCESS(2) 1783 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 1784 "bgt 1b \n" 1785 : "+r"(src_argb), // %0 1786 "+r"(dst_u), // %1 1787 "+r"(dst_v), // %2 1788 "+r"(pix) // %3 1789 : 1790 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1791 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1792 ); 1793} 1794#endif // HAS_ARGBTOUV411ROW_NEON 1795 1796// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 1797#define RGBTOUV(QB, QG, QR) \ 1798 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ 1799 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ 1800 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ 1801 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ 1802 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ 1803 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ 1804 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ 1805 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ 1806 "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ 1807 "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ 1808 1809// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1810#ifdef HAS_ARGBTOUVROW_NEON 1811void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1812 uint8* dst_u, uint8* dst_v, int pix) { 1813 asm volatile ( 1814 "add %1, %0, %1 \n" // src_stride + src_argb 1815 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1816 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1817 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1818 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1819 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1820 "vmov.u16 q15, #0x8080 \n" // 128.5 1821 ".p2align 2 \n" 1822 "1: \n" 1823 MEMACCESS(0) 1824 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1825 MEMACCESS(0) 1826 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1827 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1828 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1829 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1830 MEMACCESS(1) 1831 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 1832 MEMACCESS(1) 1833 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. 1834 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 1835 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 1836 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 1837 1838 "vrshr.u16 q0, q0, #1 \n" // 2x average 1839 "vrshr.u16 q1, q1, #1 \n" 1840 "vrshr.u16 q2, q2, #1 \n" 1841 1842 "subs %4, %4, #16 \n" // 32 processed per loop. 1843 RGBTOUV(q0, q1, q2) 1844 MEMACCESS(2) 1845 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1846 MEMACCESS(3) 1847 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1848 "bgt 1b \n" 1849 : "+r"(src_argb), // %0 1850 "+r"(src_stride_argb), // %1 1851 "+r"(dst_u), // %2 1852 "+r"(dst_v), // %3 1853 "+r"(pix) // %4 1854 : 1855 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1856 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1857 ); 1858} 1859#endif // HAS_ARGBTOUVROW_NEON 1860 1861// TODO(fbarchard): Subsample match C code. 1862#ifdef HAS_ARGBTOUVJROW_NEON 1863void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1864 uint8* dst_u, uint8* dst_v, int pix) { 1865 asm volatile ( 1866 "add %1, %0, %1 \n" // src_stride + src_argb 1867 "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient 1868 "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient 1869 "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient 1870 "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient 1871 "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient 1872 "vmov.u16 q15, #0x8080 \n" // 128.5 1873 ".p2align 2 \n" 1874 "1: \n" 1875 MEMACCESS(0) 1876 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 1877 MEMACCESS(0) 1878 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 1879 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 1880 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1881 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 1882 MEMACCESS(1) 1883 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. 1884 MEMACCESS(1) 1885 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. 1886 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 1887 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 1888 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 1889 1890 "vrshr.u16 q0, q0, #1 \n" // 2x average 1891 "vrshr.u16 q1, q1, #1 \n" 1892 "vrshr.u16 q2, q2, #1 \n" 1893 1894 "subs %4, %4, #16 \n" // 32 processed per loop. 1895 RGBTOUV(q0, q1, q2) 1896 MEMACCESS(2) 1897 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1898 MEMACCESS(3) 1899 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1900 "bgt 1b \n" 1901 : "+r"(src_argb), // %0 1902 "+r"(src_stride_argb), // %1 1903 "+r"(dst_u), // %2 1904 "+r"(dst_v), // %3 1905 "+r"(pix) // %4 1906 : 1907 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1908 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1909 ); 1910} 1911#endif // HAS_ARGBTOUVJROW_NEON 1912 1913#ifdef HAS_BGRATOUVROW_NEON 1914void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1915 uint8* dst_u, uint8* dst_v, int pix) { 1916 asm volatile ( 1917 "add %1, %0, %1 \n" // src_stride + src_bgra 1918 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1919 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1920 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1921 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1922 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1923 "vmov.u16 q15, #0x8080 \n" // 128.5 1924 ".p2align 2 \n" 1925 "1: \n" 1926 MEMACCESS(0) 1927 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. 1928 MEMACCESS(0) 1929 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. 1930 "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. 1931 "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. 1932 "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. 1933 MEMACCESS(1) 1934 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. 1935 MEMACCESS(1) 1936 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. 1937 "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. 1938 "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. 1939 "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. 1940 1941 "vrshr.u16 q1, q1, #1 \n" // 2x average 1942 "vrshr.u16 q2, q2, #1 \n" 1943 "vrshr.u16 q3, q3, #1 \n" 1944 1945 "subs %4, %4, #16 \n" // 32 processed per loop. 1946 RGBTOUV(q3, q2, q1) 1947 MEMACCESS(2) 1948 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 1949 MEMACCESS(3) 1950 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 1951 "bgt 1b \n" 1952 : "+r"(src_bgra), // %0 1953 "+r"(src_stride_bgra), // %1 1954 "+r"(dst_u), // %2 1955 "+r"(dst_v), // %3 1956 "+r"(pix) // %4 1957 : 1958 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 1959 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 1960 ); 1961} 1962#endif // HAS_BGRATOUVROW_NEON 1963 1964#ifdef HAS_ABGRTOUVROW_NEON 1965void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1966 uint8* dst_u, uint8* dst_v, int pix) { 1967 asm volatile ( 1968 "add %1, %0, %1 \n" // src_stride + src_abgr 1969 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 1970 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 1971 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 1972 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 1973 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 1974 "vmov.u16 q15, #0x8080 \n" // 128.5 1975 ".p2align 2 \n" 1976 "1: \n" 1977 MEMACCESS(0) 1978 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. 1979 MEMACCESS(0) 1980 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. 1981 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. 1982 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 1983 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. 1984 MEMACCESS(1) 1985 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. 1986 MEMACCESS(1) 1987 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. 1988 "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. 1989 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 1990 "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. 1991 1992 "vrshr.u16 q0, q0, #1 \n" // 2x average 1993 "vrshr.u16 q1, q1, #1 \n" 1994 "vrshr.u16 q2, q2, #1 \n" 1995 1996 "subs %4, %4, #16 \n" // 32 processed per loop. 1997 RGBTOUV(q2, q1, q0) 1998 MEMACCESS(2) 1999 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2000 MEMACCESS(3) 2001 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2002 "bgt 1b \n" 2003 : "+r"(src_abgr), // %0 2004 "+r"(src_stride_abgr), // %1 2005 "+r"(dst_u), // %2 2006 "+r"(dst_v), // %3 2007 "+r"(pix) // %4 2008 : 2009 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2010 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2011 ); 2012} 2013#endif // HAS_ABGRTOUVROW_NEON 2014 2015#ifdef HAS_RGBATOUVROW_NEON 2016void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 2017 uint8* dst_u, uint8* dst_v, int pix) { 2018 asm volatile ( 2019 "add %1, %0, %1 \n" // src_stride + src_rgba 2020 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2021 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2022 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2023 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2024 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2025 "vmov.u16 q15, #0x8080 \n" // 128.5 2026 ".p2align 2 \n" 2027 "1: \n" 2028 MEMACCESS(0) 2029 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. 2030 MEMACCESS(0) 2031 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. 2032 "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. 2033 "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. 2034 "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. 2035 MEMACCESS(1) 2036 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. 2037 MEMACCESS(1) 2038 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. 2039 "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. 2040 "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. 2041 "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. 2042 2043 "vrshr.u16 q0, q0, #1 \n" // 2x average 2044 "vrshr.u16 q1, q1, #1 \n" 2045 "vrshr.u16 q2, q2, #1 \n" 2046 2047 "subs %4, %4, #16 \n" // 32 processed per loop. 2048 RGBTOUV(q0, q1, q2) 2049 MEMACCESS(2) 2050 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2051 MEMACCESS(3) 2052 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2053 "bgt 1b \n" 2054 : "+r"(src_rgba), // %0 2055 "+r"(src_stride_rgba), // %1 2056 "+r"(dst_u), // %2 2057 "+r"(dst_v), // %3 2058 "+r"(pix) // %4 2059 : 2060 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2061 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2062 ); 2063} 2064#endif // HAS_RGBATOUVROW_NEON 2065 2066#ifdef HAS_RGB24TOUVROW_NEON 2067void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 2068 uint8* dst_u, uint8* dst_v, int pix) { 2069 asm volatile ( 2070 "add %1, %0, %1 \n" // src_stride + src_rgb24 2071 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2072 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2073 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2074 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2075 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2076 "vmov.u16 q15, #0x8080 \n" // 128.5 2077 ".p2align 2 \n" 2078 "1: \n" 2079 MEMACCESS(0) 2080 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. 2081 MEMACCESS(0) 2082 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. 2083 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 2084 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 2085 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. 2086 MEMACCESS(1) 2087 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. 2088 MEMACCESS(1) 2089 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. 2090 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. 2091 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 2092 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. 2093 2094 "vrshr.u16 q0, q0, #1 \n" // 2x average 2095 "vrshr.u16 q1, q1, #1 \n" 2096 "vrshr.u16 q2, q2, #1 \n" 2097 2098 "subs %4, %4, #16 \n" // 32 processed per loop. 2099 RGBTOUV(q0, q1, q2) 2100 MEMACCESS(2) 2101 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2102 MEMACCESS(3) 2103 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2104 "bgt 1b \n" 2105 : "+r"(src_rgb24), // %0 2106 "+r"(src_stride_rgb24), // %1 2107 "+r"(dst_u), // %2 2108 "+r"(dst_v), // %3 2109 "+r"(pix) // %4 2110 : 2111 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2112 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2113 ); 2114} 2115#endif // HAS_RGB24TOUVROW_NEON 2116 2117#ifdef HAS_RAWTOUVROW_NEON 2118void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 2119 uint8* dst_u, uint8* dst_v, int pix) { 2120 asm volatile ( 2121 "add %1, %0, %1 \n" // src_stride + src_raw 2122 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2123 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2124 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2125 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2126 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2127 "vmov.u16 q15, #0x8080 \n" // 128.5 2128 ".p2align 2 \n" 2129 "1: \n" 2130 MEMACCESS(0) 2131 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. 2132 MEMACCESS(0) 2133 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. 2134 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. 2135 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 2136 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. 2137 MEMACCESS(1) 2138 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. 2139 MEMACCESS(1) 2140 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. 2141 "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. 2142 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. 2143 "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. 2144 2145 "vrshr.u16 q0, q0, #1 \n" // 2x average 2146 "vrshr.u16 q1, q1, #1 \n" 2147 "vrshr.u16 q2, q2, #1 \n" 2148 2149 "subs %4, %4, #16 \n" // 32 processed per loop. 2150 RGBTOUV(q2, q1, q0) 2151 MEMACCESS(2) 2152 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2153 MEMACCESS(3) 2154 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2155 "bgt 1b \n" 2156 : "+r"(src_raw), // %0 2157 "+r"(src_stride_raw), // %1 2158 "+r"(dst_u), // %2 2159 "+r"(dst_v), // %3 2160 "+r"(pix) // %4 2161 : 2162 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2163 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2164 ); 2165} 2166#endif // HAS_RAWTOUVROW_NEON 2167 2168// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2169#ifdef HAS_RGB565TOUVROW_NEON 2170void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 2171 uint8* dst_u, uint8* dst_v, int pix) { 2172 asm volatile ( 2173 "add %1, %0, %1 \n" // src_stride + src_argb 2174 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2175 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2176 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2177 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2178 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2179 "vmov.u16 q15, #0x8080 \n" // 128.5 2180 ".p2align 2 \n" 2181 "1: \n" 2182 MEMACCESS(0) 2183 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 2184 RGB565TOARGB 2185 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2186 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2187 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2188 MEMACCESS(0) 2189 "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. 2190 RGB565TOARGB 2191 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2192 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2193 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2194 2195 MEMACCESS(1) 2196 "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. 2197 RGB565TOARGB 2198 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2199 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2200 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2201 MEMACCESS(1) 2202 "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. 2203 RGB565TOARGB 2204 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2205 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2206 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2207 2208 "vrshr.u16 q4, q4, #1 \n" // 2x average 2209 "vrshr.u16 q5, q5, #1 \n" 2210 "vrshr.u16 q6, q6, #1 \n" 2211 2212 "subs %4, %4, #16 \n" // 16 processed per loop. 2213 "vmul.s16 q8, q4, q10 \n" // B 2214 "vmls.s16 q8, q5, q11 \n" // G 2215 "vmls.s16 q8, q6, q12 \n" // R 2216 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 2217 "vmul.s16 q9, q6, q10 \n" // R 2218 "vmls.s16 q9, q5, q14 \n" // G 2219 "vmls.s16 q9, q4, q13 \n" // B 2220 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 2221 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 2222 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 2223 MEMACCESS(2) 2224 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2225 MEMACCESS(3) 2226 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2227 "bgt 1b \n" 2228 : "+r"(src_rgb565), // %0 2229 "+r"(src_stride_rgb565), // %1 2230 "+r"(dst_u), // %2 2231 "+r"(dst_v), // %3 2232 "+r"(pix) // %4 2233 : 2234 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2235 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2236 ); 2237} 2238#endif // HAS_RGB565TOUVROW_NEON 2239 2240// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2241#ifdef HAS_ARGB1555TOUVROW_NEON 2242void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 2243 uint8* dst_u, uint8* dst_v, int pix) { 2244 asm volatile ( 2245 "add %1, %0, %1 \n" // src_stride + src_argb 2246 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2247 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2248 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2249 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2250 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2251 "vmov.u16 q15, #0x8080 \n" // 128.5 2252 ".p2align 2 \n" 2253 "1: \n" 2254 MEMACCESS(0) 2255 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 2256 RGB555TOARGB 2257 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2258 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2259 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2260 MEMACCESS(0) 2261 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. 2262 RGB555TOARGB 2263 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2264 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2265 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2266 2267 MEMACCESS(1) 2268 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. 2269 RGB555TOARGB 2270 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2271 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2272 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2273 MEMACCESS(1) 2274 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. 2275 RGB555TOARGB 2276 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2277 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2278 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2279 2280 "vrshr.u16 q4, q4, #1 \n" // 2x average 2281 "vrshr.u16 q5, q5, #1 \n" 2282 "vrshr.u16 q6, q6, #1 \n" 2283 2284 "subs %4, %4, #16 \n" // 16 processed per loop. 2285 "vmul.s16 q8, q4, q10 \n" // B 2286 "vmls.s16 q8, q5, q11 \n" // G 2287 "vmls.s16 q8, q6, q12 \n" // R 2288 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 2289 "vmul.s16 q9, q6, q10 \n" // R 2290 "vmls.s16 q9, q5, q14 \n" // G 2291 "vmls.s16 q9, q4, q13 \n" // B 2292 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 2293 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 2294 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 2295 MEMACCESS(2) 2296 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2297 MEMACCESS(3) 2298 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2299 "bgt 1b \n" 2300 : "+r"(src_argb1555), // %0 2301 "+r"(src_stride_argb1555), // %1 2302 "+r"(dst_u), // %2 2303 "+r"(dst_v), // %3 2304 "+r"(pix) // %4 2305 : 2306 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2307 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2308 ); 2309} 2310#endif // HAS_ARGB1555TOUVROW_NEON 2311 2312// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. 2313#ifdef HAS_ARGB4444TOUVROW_NEON 2314void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 2315 uint8* dst_u, uint8* dst_v, int pix) { 2316 asm volatile ( 2317 "add %1, %0, %1 \n" // src_stride + src_argb 2318 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient 2319 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient 2320 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient 2321 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient 2322 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient 2323 "vmov.u16 q15, #0x8080 \n" // 128.5 2324 ".p2align 2 \n" 2325 "1: \n" 2326 MEMACCESS(0) 2327 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 2328 ARGB4444TOARGB 2329 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2330 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2331 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2332 MEMACCESS(0) 2333 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. 2334 ARGB4444TOARGB 2335 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2336 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2337 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2338 2339 MEMACCESS(1) 2340 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. 2341 ARGB4444TOARGB 2342 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. 2343 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 2344 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. 2345 MEMACCESS(1) 2346 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. 2347 ARGB4444TOARGB 2348 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 2349 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. 2350 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. 2351 2352 "vrshr.u16 q4, q4, #1 \n" // 2x average 2353 "vrshr.u16 q5, q5, #1 \n" 2354 "vrshr.u16 q6, q6, #1 \n" 2355 2356 "subs %4, %4, #16 \n" // 16 processed per loop. 2357 "vmul.s16 q8, q4, q10 \n" // B 2358 "vmls.s16 q8, q5, q11 \n" // G 2359 "vmls.s16 q8, q6, q12 \n" // R 2360 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned 2361 "vmul.s16 q9, q6, q10 \n" // R 2362 "vmls.s16 q9, q5, q14 \n" // G 2363 "vmls.s16 q9, q4, q13 \n" // B 2364 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned 2365 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U 2366 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V 2367 MEMACCESS(2) 2368 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 2369 MEMACCESS(3) 2370 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 2371 "bgt 1b \n" 2372 : "+r"(src_argb4444), // %0 2373 "+r"(src_stride_argb4444), // %1 2374 "+r"(dst_u), // %2 2375 "+r"(dst_v), // %3 2376 "+r"(pix) // %4 2377 : 2378 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", 2379 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" 2380 ); 2381} 2382#endif // HAS_ARGB4444TOUVROW_NEON 2383 2384#ifdef HAS_RGB565TOYROW_NEON 2385void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { 2386 asm volatile ( 2387 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2388 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2389 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2390 "vmov.u8 d27, #16 \n" // Add 16 constant 2391 ".p2align 2 \n" 2392 "1: \n" 2393 MEMACCESS(0) 2394 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. 2395 "subs %2, %2, #8 \n" // 8 processed per loop. 2396 RGB565TOARGB 2397 "vmull.u8 q2, d0, d24 \n" // B 2398 "vmlal.u8 q2, d1, d25 \n" // G 2399 "vmlal.u8 q2, d2, d26 \n" // R 2400 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2401 "vqadd.u8 d0, d27 \n" 2402 MEMACCESS(1) 2403 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2404 "bgt 1b \n" 2405 : "+r"(src_rgb565), // %0 2406 "+r"(dst_y), // %1 2407 "+r"(pix) // %2 2408 : 2409 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2410 ); 2411} 2412#endif // HAS_RGB565TOYROW_NEON 2413 2414#ifdef HAS_ARGB1555TOYROW_NEON 2415void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { 2416 asm volatile ( 2417 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2418 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2419 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2420 "vmov.u8 d27, #16 \n" // Add 16 constant 2421 ".p2align 2 \n" 2422 "1: \n" 2423 MEMACCESS(0) 2424 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. 2425 "subs %2, %2, #8 \n" // 8 processed per loop. 2426 ARGB1555TOARGB 2427 "vmull.u8 q2, d0, d24 \n" // B 2428 "vmlal.u8 q2, d1, d25 \n" // G 2429 "vmlal.u8 q2, d2, d26 \n" // R 2430 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2431 "vqadd.u8 d0, d27 \n" 2432 MEMACCESS(1) 2433 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2434 "bgt 1b \n" 2435 : "+r"(src_argb1555), // %0 2436 "+r"(dst_y), // %1 2437 "+r"(pix) // %2 2438 : 2439 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2440 ); 2441} 2442#endif // HAS_ARGB1555TOYROW_NEON 2443 2444#ifdef HAS_ARGB4444TOYROW_NEON 2445void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { 2446 asm volatile ( 2447 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient 2448 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient 2449 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient 2450 "vmov.u8 d27, #16 \n" // Add 16 constant 2451 ".p2align 2 \n" 2452 "1: \n" 2453 MEMACCESS(0) 2454 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 2455 "subs %2, %2, #8 \n" // 8 processed per loop. 2456 ARGB4444TOARGB 2457 "vmull.u8 q2, d0, d24 \n" // B 2458 "vmlal.u8 q2, d1, d25 \n" // G 2459 "vmlal.u8 q2, d2, d26 \n" // R 2460 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y 2461 "vqadd.u8 d0, d27 \n" 2462 MEMACCESS(1) 2463 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2464 "bgt 1b \n" 2465 : "+r"(src_argb4444), // %0 2466 "+r"(dst_y), // %1 2467 "+r"(pix) // %2 2468 : 2469 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" 2470 ); 2471} 2472#endif // HAS_ARGB4444TOYROW_NEON 2473 2474#ifdef HAS_BGRATOYROW_NEON 2475void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { 2476 asm volatile ( 2477 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2478 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2479 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2480 "vmov.u8 d7, #16 \n" // Add 16 constant 2481 ".p2align 2 \n" 2482 "1: \n" 2483 MEMACCESS(0) 2484 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. 2485 "subs %2, %2, #8 \n" // 8 processed per loop. 2486 "vmull.u8 q8, d1, d4 \n" // R 2487 "vmlal.u8 q8, d2, d5 \n" // G 2488 "vmlal.u8 q8, d3, d6 \n" // B 2489 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2490 "vqadd.u8 d0, d7 \n" 2491 MEMACCESS(1) 2492 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2493 "bgt 1b \n" 2494 : "+r"(src_bgra), // %0 2495 "+r"(dst_y), // %1 2496 "+r"(pix) // %2 2497 : 2498 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2499 ); 2500} 2501#endif // HAS_BGRATOYROW_NEON 2502 2503#ifdef HAS_ABGRTOYROW_NEON 2504void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { 2505 asm volatile ( 2506 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2507 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2508 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2509 "vmov.u8 d7, #16 \n" // Add 16 constant 2510 ".p2align 2 \n" 2511 "1: \n" 2512 MEMACCESS(0) 2513 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. 2514 "subs %2, %2, #8 \n" // 8 processed per loop. 2515 "vmull.u8 q8, d0, d4 \n" // R 2516 "vmlal.u8 q8, d1, d5 \n" // G 2517 "vmlal.u8 q8, d2, d6 \n" // B 2518 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2519 "vqadd.u8 d0, d7 \n" 2520 MEMACCESS(1) 2521 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2522 "bgt 1b \n" 2523 : "+r"(src_abgr), // %0 2524 "+r"(dst_y), // %1 2525 "+r"(pix) // %2 2526 : 2527 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2528 ); 2529} 2530#endif // HAS_ABGRTOYROW_NEON 2531 2532#ifdef HAS_RGBATOYROW_NEON 2533void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { 2534 asm volatile ( 2535 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient 2536 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2537 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 2538 "vmov.u8 d7, #16 \n" // Add 16 constant 2539 ".p2align 2 \n" 2540 "1: \n" 2541 MEMACCESS(0) 2542 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. 2543 "subs %2, %2, #8 \n" // 8 processed per loop. 2544 "vmull.u8 q8, d1, d4 \n" // B 2545 "vmlal.u8 q8, d2, d5 \n" // G 2546 "vmlal.u8 q8, d3, d6 \n" // R 2547 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2548 "vqadd.u8 d0, d7 \n" 2549 MEMACCESS(1) 2550 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2551 "bgt 1b \n" 2552 : "+r"(src_rgba), // %0 2553 "+r"(dst_y), // %1 2554 "+r"(pix) // %2 2555 : 2556 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2557 ); 2558} 2559#endif // HAS_RGBATOYROW_NEON 2560 2561#ifdef HAS_RGB24TOYROW_NEON 2562void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { 2563 asm volatile ( 2564 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient 2565 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2566 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient 2567 "vmov.u8 d7, #16 \n" // Add 16 constant 2568 ".p2align 2 \n" 2569 "1: \n" 2570 MEMACCESS(0) 2571 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. 2572 "subs %2, %2, #8 \n" // 8 processed per loop. 2573 "vmull.u8 q8, d0, d4 \n" // B 2574 "vmlal.u8 q8, d1, d5 \n" // G 2575 "vmlal.u8 q8, d2, d6 \n" // R 2576 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2577 "vqadd.u8 d0, d7 \n" 2578 MEMACCESS(1) 2579 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2580 "bgt 1b \n" 2581 : "+r"(src_rgb24), // %0 2582 "+r"(dst_y), // %1 2583 "+r"(pix) // %2 2584 : 2585 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2586 ); 2587} 2588#endif // HAS_RGB24TOYROW_NEON 2589 2590#ifdef HAS_RAWTOYROW_NEON 2591void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { 2592 asm volatile ( 2593 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient 2594 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient 2595 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient 2596 "vmov.u8 d7, #16 \n" // Add 16 constant 2597 ".p2align 2 \n" 2598 "1: \n" 2599 MEMACCESS(0) 2600 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. 2601 "subs %2, %2, #8 \n" // 8 processed per loop. 2602 "vmull.u8 q8, d0, d4 \n" // B 2603 "vmlal.u8 q8, d1, d5 \n" // G 2604 "vmlal.u8 q8, d2, d6 \n" // R 2605 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y 2606 "vqadd.u8 d0, d7 \n" 2607 MEMACCESS(1) 2608 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 2609 "bgt 1b \n" 2610 : "+r"(src_raw), // %0 2611 "+r"(dst_y), // %1 2612 "+r"(pix) // %2 2613 : 2614 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" 2615 ); 2616} 2617#endif // HAS_RAWTOYROW_NEON 2618 2619// Bilinear filter 16x2 -> 16x1 2620#ifdef HAS_INTERPOLATEROW_NEON 2621void InterpolateRow_NEON(uint8* dst_ptr, 2622 const uint8* src_ptr, ptrdiff_t src_stride, 2623 int dst_width, int source_y_fraction) { 2624 asm volatile ( 2625 "cmp %4, #0 \n" 2626 "beq 100f \n" 2627 "add %2, %1 \n" 2628 "cmp %4, #64 \n" 2629 "beq 75f \n" 2630 "cmp %4, #128 \n" 2631 "beq 50f \n" 2632 "cmp %4, #192 \n" 2633 "beq 25f \n" 2634 2635 "vdup.8 d5, %4 \n" 2636 "rsb %4, #256 \n" 2637 "vdup.8 d4, %4 \n" 2638 // General purpose row blend. 2639 "1: \n" 2640 MEMACCESS(1) 2641 "vld1.8 {q0}, [%1]! \n" 2642 MEMACCESS(2) 2643 "vld1.8 {q1}, [%2]! \n" 2644 "subs %3, %3, #16 \n" 2645 "vmull.u8 q13, d0, d4 \n" 2646 "vmull.u8 q14, d1, d4 \n" 2647 "vmlal.u8 q13, d2, d5 \n" 2648 "vmlal.u8 q14, d3, d5 \n" 2649 "vrshrn.u16 d0, q13, #8 \n" 2650 "vrshrn.u16 d1, q14, #8 \n" 2651 MEMACCESS(0) 2652 "vst1.8 {q0}, [%0]! \n" 2653 "bgt 1b \n" 2654 "b 99f \n" 2655 2656 // Blend 25 / 75. 2657 "25: \n" 2658 MEMACCESS(1) 2659 "vld1.8 {q0}, [%1]! \n" 2660 MEMACCESS(2) 2661 "vld1.8 {q1}, [%2]! \n" 2662 "subs %3, %3, #16 \n" 2663 "vrhadd.u8 q0, q1 \n" 2664 "vrhadd.u8 q0, q1 \n" 2665 MEMACCESS(0) 2666 "vst1.8 {q0}, [%0]! \n" 2667 "bgt 25b \n" 2668 "b 99f \n" 2669 2670 // Blend 50 / 50. 2671 "50: \n" 2672 MEMACCESS(1) 2673 "vld1.8 {q0}, [%1]! \n" 2674 MEMACCESS(2) 2675 "vld1.8 {q1}, [%2]! \n" 2676 "subs %3, %3, #16 \n" 2677 "vrhadd.u8 q0, q1 \n" 2678 MEMACCESS(0) 2679 "vst1.8 {q0}, [%0]! \n" 2680 "bgt 50b \n" 2681 "b 99f \n" 2682 2683 // Blend 75 / 25. 2684 "75: \n" 2685 MEMACCESS(1) 2686 "vld1.8 {q1}, [%1]! \n" 2687 MEMACCESS(2) 2688 "vld1.8 {q0}, [%2]! \n" 2689 "subs %3, %3, #16 \n" 2690 "vrhadd.u8 q0, q1 \n" 2691 "vrhadd.u8 q0, q1 \n" 2692 MEMACCESS(0) 2693 "vst1.8 {q0}, [%0]! \n" 2694 "bgt 75b \n" 2695 "b 99f \n" 2696 2697 // Blend 100 / 0 - Copy row unchanged. 2698 "100: \n" 2699 MEMACCESS(1) 2700 "vld1.8 {q0}, [%1]! \n" 2701 "subs %3, %3, #16 \n" 2702 MEMACCESS(0) 2703 "vst1.8 {q0}, [%0]! \n" 2704 "bgt 100b \n" 2705 2706 "99: \n" 2707 : "+r"(dst_ptr), // %0 2708 "+r"(src_ptr), // %1 2709 "+r"(src_stride), // %2 2710 "+r"(dst_width), // %3 2711 "+r"(source_y_fraction) // %4 2712 : 2713 : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" 2714 ); 2715} 2716#endif // HAS_INTERPOLATEROW_NEON 2717 2718// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2719#ifdef HAS_ARGBBLENDROW_NEON 2720void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2721 uint8* dst_argb, int width) { 2722 asm volatile ( 2723 "subs %3, #8 \n" 2724 "blt 89f \n" 2725 // Blend 8 pixels. 2726 "8: \n" 2727 MEMACCESS(0) 2728 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. 2729 MEMACCESS(1) 2730 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. 2731 "subs %3, %3, #8 \n" // 8 processed per loop. 2732 "vmull.u8 q10, d4, d3 \n" // db * a 2733 "vmull.u8 q11, d5, d3 \n" // dg * a 2734 "vmull.u8 q12, d6, d3 \n" // dr * a 2735 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 2736 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 2737 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 2738 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 2739 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 2740 "vqadd.u8 q0, q0, q2 \n" // + sbg 2741 "vqadd.u8 d2, d2, d6 \n" // + sr 2742 "vmov.u8 d3, #255 \n" // a = 255 2743 MEMACCESS(2) 2744 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. 2745 "bge 8b \n" 2746 2747 "89: \n" 2748 "adds %3, #8-1 \n" 2749 "blt 99f \n" 2750 2751 // Blend 1 pixels. 2752 "1: \n" 2753 MEMACCESS(0) 2754 "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. 2755 MEMACCESS(1) 2756 "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. 2757 "subs %3, %3, #1 \n" // 1 processed per loop. 2758 "vmull.u8 q10, d4, d3 \n" // db * a 2759 "vmull.u8 q11, d5, d3 \n" // dg * a 2760 "vmull.u8 q12, d6, d3 \n" // dr * a 2761 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 2762 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 2763 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 2764 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 2765 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 2766 "vqadd.u8 q0, q0, q2 \n" // + sbg 2767 "vqadd.u8 d2, d2, d6 \n" // + sr 2768 "vmov.u8 d3, #255 \n" // a = 255 2769 MEMACCESS(2) 2770 "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. 2771 "bge 1b \n" 2772 2773 "99: \n" 2774 2775 : "+r"(src_argb0), // %0 2776 "+r"(src_argb1), // %1 2777 "+r"(dst_argb), // %2 2778 "+r"(width) // %3 2779 : 2780 : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" 2781 ); 2782} 2783#endif // HAS_ARGBBLENDROW_NEON 2784 2785// Attenuate 8 pixels at a time. 2786#ifdef HAS_ARGBATTENUATEROW_NEON 2787void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2788 asm volatile ( 2789 // Attenuate 8 pixels. 2790 "1: \n" 2791 MEMACCESS(0) 2792 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. 2793 "subs %2, %2, #8 \n" // 8 processed per loop. 2794 "vmull.u8 q10, d0, d3 \n" // b * a 2795 "vmull.u8 q11, d1, d3 \n" // g * a 2796 "vmull.u8 q12, d2, d3 \n" // r * a 2797 "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 2798 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 2799 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 2800 MEMACCESS(1) 2801 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 2802 "bgt 1b \n" 2803 : "+r"(src_argb), // %0 2804 "+r"(dst_argb), // %1 2805 "+r"(width) // %2 2806 : 2807 : "cc", "memory", "q0", "q1", "q10", "q11", "q12" 2808 ); 2809} 2810#endif // HAS_ARGBATTENUATEROW_NEON 2811 2812// Quantize 8 ARGB pixels (32 bytes). 2813// dst = (dst * scale >> 16) * interval_size + interval_offset; 2814#ifdef HAS_ARGBQUANTIZEROW_NEON 2815void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, 2816 int interval_offset, int width) { 2817 asm volatile ( 2818 "vdup.u16 q8, %2 \n" 2819 "vshr.u16 q8, q8, #1 \n" // scale >>= 1 2820 "vdup.u16 q9, %3 \n" // interval multiply. 2821 "vdup.u16 q10, %4 \n" // interval add 2822 2823 // 8 pixel loop. 2824 ".p2align 2 \n" 2825 "1: \n" 2826 MEMACCESS(0) 2827 "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. 2828 "subs %1, %1, #8 \n" // 8 processed per loop. 2829 "vmovl.u8 q0, d0 \n" // b (0 .. 255) 2830 "vmovl.u8 q1, d2 \n" 2831 "vmovl.u8 q2, d4 \n" 2832 "vqdmulh.s16 q0, q0, q8 \n" // b * scale 2833 "vqdmulh.s16 q1, q1, q8 \n" // g 2834 "vqdmulh.s16 q2, q2, q8 \n" // r 2835 "vmul.u16 q0, q0, q9 \n" // b * interval_size 2836 "vmul.u16 q1, q1, q9 \n" // g 2837 "vmul.u16 q2, q2, q9 \n" // r 2838 "vadd.u16 q0, q0, q10 \n" // b + interval_offset 2839 "vadd.u16 q1, q1, q10 \n" // g 2840 "vadd.u16 q2, q2, q10 \n" // r 2841 "vqmovn.u16 d0, q0 \n" 2842 "vqmovn.u16 d2, q1 \n" 2843 "vqmovn.u16 d4, q2 \n" 2844 MEMACCESS(0) 2845 "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. 2846 "bgt 1b \n" 2847 : "+r"(dst_argb), // %0 2848 "+r"(width) // %1 2849 : "r"(scale), // %2 2850 "r"(interval_size), // %3 2851 "r"(interval_offset) // %4 2852 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" 2853 ); 2854} 2855#endif // HAS_ARGBQUANTIZEROW_NEON 2856 2857// Shade 8 pixels at a time by specified value. 2858// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. 2859// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2860#ifdef HAS_ARGBSHADEROW_NEON 2861void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, 2862 uint32 value) { 2863 asm volatile ( 2864 "vdup.u32 q0, %3 \n" // duplicate scale value. 2865 "vzip.u8 d0, d1 \n" // d0 aarrggbb. 2866 "vshr.u16 q0, q0, #1 \n" // scale / 2. 2867 2868 // 8 pixel loop. 2869 ".p2align 2 \n" 2870 "1: \n" 2871 MEMACCESS(0) 2872 "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. 2873 "subs %2, %2, #8 \n" // 8 processed per loop. 2874 "vmovl.u8 q10, d20 \n" // b (0 .. 255) 2875 "vmovl.u8 q11, d22 \n" 2876 "vmovl.u8 q12, d24 \n" 2877 "vmovl.u8 q13, d26 \n" 2878 "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 2879 "vqrdmulh.s16 q11, q11, d0[1] \n" // g 2880 "vqrdmulh.s16 q12, q12, d0[2] \n" // r 2881 "vqrdmulh.s16 q13, q13, d0[3] \n" // a 2882 "vqmovn.u16 d20, q10 \n" 2883 "vqmovn.u16 d22, q11 \n" 2884 "vqmovn.u16 d24, q12 \n" 2885 "vqmovn.u16 d26, q13 \n" 2886 MEMACCESS(1) 2887 "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. 2888 "bgt 1b \n" 2889 : "+r"(src_argb), // %0 2890 "+r"(dst_argb), // %1 2891 "+r"(width) // %2 2892 : "r"(value) // %3 2893 : "cc", "memory", "q0", "q10", "q11", "q12", "q13" 2894 ); 2895} 2896#endif // HAS_ARGBSHADEROW_NEON 2897 2898// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels 2899// Similar to ARGBToYJ but stores ARGB. 2900// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2901#ifdef HAS_ARGBGRAYROW_NEON 2902void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2903 asm volatile ( 2904 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient 2905 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient 2906 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient 2907 ".p2align 2 \n" 2908 "1: \n" 2909 MEMACCESS(0) 2910 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. 2911 "subs %2, %2, #8 \n" // 8 processed per loop. 2912 "vmull.u8 q2, d0, d24 \n" // B 2913 "vmlal.u8 q2, d1, d25 \n" // G 2914 "vmlal.u8 q2, d2, d26 \n" // R 2915 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B 2916 "vmov d1, d0 \n" // G 2917 "vmov d2, d0 \n" // R 2918 MEMACCESS(1) 2919 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. 2920 "bgt 1b \n" 2921 : "+r"(src_argb), // %0 2922 "+r"(dst_argb), // %1 2923 "+r"(width) // %2 2924 : 2925 : "cc", "memory", "q0", "q1", "q2", "q12", "q13" 2926 ); 2927} 2928#endif // HAS_ARGBGRAYROW_NEON 2929 2930// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 2931// b = (r * 35 + g * 68 + b * 17) >> 7 2932// g = (r * 45 + g * 88 + b * 22) >> 7 2933// r = (r * 50 + g * 98 + b * 24) >> 7 2934 2935#ifdef HAS_ARGBSEPIAROW_NEON 2936void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { 2937 asm volatile ( 2938 "vmov.u8 d20, #17 \n" // BB coefficient 2939 "vmov.u8 d21, #68 \n" // BG coefficient 2940 "vmov.u8 d22, #35 \n" // BR coefficient 2941 "vmov.u8 d24, #22 \n" // GB coefficient 2942 "vmov.u8 d25, #88 \n" // GG coefficient 2943 "vmov.u8 d26, #45 \n" // GR coefficient 2944 "vmov.u8 d28, #24 \n" // BB coefficient 2945 "vmov.u8 d29, #98 \n" // BG coefficient 2946 "vmov.u8 d30, #50 \n" // BR coefficient 2947 ".p2align 2 \n" 2948 "1: \n" 2949 MEMACCESS(0) 2950 "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. 2951 "subs %1, %1, #8 \n" // 8 processed per loop. 2952 "vmull.u8 q2, d0, d20 \n" // B to Sepia B 2953 "vmlal.u8 q2, d1, d21 \n" // G 2954 "vmlal.u8 q2, d2, d22 \n" // R 2955 "vmull.u8 q3, d0, d24 \n" // B to Sepia G 2956 "vmlal.u8 q3, d1, d25 \n" // G 2957 "vmlal.u8 q3, d2, d26 \n" // R 2958 "vmull.u8 q8, d0, d28 \n" // B to Sepia R 2959 "vmlal.u8 q8, d1, d29 \n" // G 2960 "vmlal.u8 q8, d2, d30 \n" // R 2961 "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B 2962 "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G 2963 "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R 2964 MEMACCESS(0) 2965 "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. 2966 "bgt 1b \n" 2967 : "+r"(dst_argb), // %0 2968 "+r"(width) // %1 2969 : 2970 : "cc", "memory", "q0", "q1", "q2", "q3", 2971 "q10", "q11", "q12", "q13", "q14", "q15" 2972 ); 2973} 2974#endif // HAS_ARGBSEPIAROW_NEON 2975 2976// Tranform 8 ARGB pixels (32 bytes) with color matrix. 2977// TODO(fbarchard): Was same as Sepia except matrix is provided. This function 2978// needs to saturate. Consider doing a non-saturating version. 2979#ifdef HAS_ARGBCOLORMATRIXROW_NEON 2980void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, 2981 const int8* matrix_argb, int width) { 2982 asm volatile ( 2983 MEMACCESS(3) 2984 "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. 2985 "vmovl.s8 q0, d4 \n" // B,G coefficients s16. 2986 "vmovl.s8 q1, d5 \n" // R,A coefficients s16. 2987 2988 ".p2align 2 \n" 2989 "1: \n" 2990 MEMACCESS(0) 2991 "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. 2992 "subs %2, %2, #8 \n" // 8 processed per loop. 2993 "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit 2994 "vmovl.u8 q9, d18 \n" // g 2995 "vmovl.u8 q10, d20 \n" // r 2996 "vmovl.u8 q15, d22 \n" // a 2997 "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B 2998 "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G 2999 "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R 3000 "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A 3001 "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B 3002 "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G 3003 "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R 3004 "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A 3005 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 3006 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 3007 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 3008 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 3009 "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B 3010 "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G 3011 "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R 3012 "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A 3013 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 3014 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 3015 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 3016 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 3017 "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B 3018 "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G 3019 "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R 3020 "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A 3021 "vqadd.s16 q12, q12, q4 \n" // Accumulate B 3022 "vqadd.s16 q13, q13, q5 \n" // Accumulate G 3023 "vqadd.s16 q14, q14, q6 \n" // Accumulate R 3024 "vqadd.s16 q15, q15, q7 \n" // Accumulate A 3025 "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B 3026 "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G 3027 "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R 3028 "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A 3029 MEMACCESS(1) 3030 "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. 3031 "bgt 1b \n" 3032 : "+r"(src_argb), // %0 3033 "+r"(dst_argb), // %1 3034 "+r"(width) // %2 3035 : "r"(matrix_argb) // %3 3036 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 3037 "q10", "q11", "q12", "q13", "q14", "q15" 3038 ); 3039} 3040#endif // HAS_ARGBCOLORMATRIXROW_NEON 3041 3042// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. 3043// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 3044#ifdef HAS_ARGBMULTIPLYROW_NEON 3045void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3046 uint8* dst_argb, int width) { 3047 asm volatile ( 3048 // 8 pixel loop. 3049 ".p2align 2 \n" 3050 "1: \n" 3051 MEMACCESS(0) 3052 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 3053 MEMACCESS(1) 3054 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 3055 "subs %3, %3, #8 \n" // 8 processed per loop. 3056 "umull v0.8h, v0.8b, v4.8b \n" // multiply B 3057 "umull v1.8h, v1.8b, v5.8b \n" // multiply G 3058 "umull v2.8h, v2.8b, v6.8b \n" // multiply R 3059 "umull v3.8h, v3.8b, v7.8b \n" // multiply A 3060 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B 3061 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G 3062 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R 3063 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A 3064 MEMACCESS(2) 3065 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 3066 "bgt 1b \n" 3067 3068 : "+r"(src_argb0), // %0 3069 "+r"(src_argb1), // %1 3070 "+r"(dst_argb), // %2 3071 "+r"(width) // %3 3072 : 3073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 3074 ); 3075} 3076#endif // HAS_ARGBMULTIPLYROW_NEON 3077 3078// Add 2 rows of ARGB pixels together, 8 pixels at a time. 3079#ifdef HAS_ARGBADDROW_NEON 3080void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3081 uint8* dst_argb, int width) { 3082 asm volatile ( 3083 // 8 pixel loop. 3084 ".p2align 2 \n" 3085 "1: \n" 3086 MEMACCESS(0) 3087 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 3088 MEMACCESS(1) 3089 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 3090 "subs %3, %3, #8 \n" // 8 processed per loop. 3091 "uqadd v0.8b, v0.8b, v4.8b \n" 3092 "uqadd v1.8b, v1.8b, v5.8b \n" 3093 "uqadd v2.8b, v2.8b, v6.8b \n" 3094 "uqadd v3.8b, v3.8b, v7.8b \n" 3095 MEMACCESS(2) 3096 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 3097 "bgt 1b \n" 3098 3099 : "+r"(src_argb0), // %0 3100 "+r"(src_argb1), // %1 3101 "+r"(dst_argb), // %2 3102 "+r"(width) // %3 3103 : 3104 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 3105 ); 3106} 3107#endif // HAS_ARGBADDROW_NEON 3108 3109// Subtract 2 rows of ARGB pixels, 8 pixels at a time. 3110#ifdef HAS_ARGBSUBTRACTROW_NEON 3111void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 3112 uint8* dst_argb, int width) { 3113 asm volatile ( 3114 // 8 pixel loop. 3115 ".p2align 2 \n" 3116 "1: \n" 3117 MEMACCESS(0) 3118 "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 3119 MEMACCESS(1) 3120 "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. 3121 "subs %3, %3, #8 \n" // 8 processed per loop. 3122 "uqsub v0.8b, v0.8b, v4.8b \n" 3123 "uqsub v1.8b, v1.8b, v5.8b \n" 3124 "uqsub v2.8b, v2.8b, v6.8b \n" 3125 "uqsub v3.8b, v3.8b, v7.8b \n" 3126 MEMACCESS(2) 3127 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 3128 "bgt 1b \n" 3129 3130 : "+r"(src_argb0), // %0 3131 "+r"(src_argb1), // %1 3132 "+r"(dst_argb), // %2 3133 "+r"(width) // %3 3134 : 3135 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 3136 ); 3137} 3138#endif // HAS_ARGBSUBTRACTROW_NEON 3139 3140// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 3141// A = 255 3142// R = Sobel 3143// G = Sobel 3144// B = Sobel 3145#ifdef HAS_SOBELROW_NEON 3146void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3147 uint8* dst_argb, int width) { 3148 asm volatile ( 3149 "movi v3.8b, #255 \n" // alpha 3150 // 8 pixel loop. 3151 ".p2align 2 \n" 3152 "1: \n" 3153 MEMACCESS(0) 3154 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. 3155 MEMACCESS(1) 3156 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. 3157 "subs %3, %3, #8 \n" // 8 processed per loop. 3158 "uqadd v0.8b, v0.8b, v1.8b \n" // add 3159 "mov v1.8b, v0.8b \n" 3160 "mov v2.8b, v0.8b \n" 3161 MEMACCESS(2) 3162 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 3163 "bgt 1b \n" 3164 : "+r"(src_sobelx), // %0 3165 "+r"(src_sobely), // %1 3166 "+r"(dst_argb), // %2 3167 "+r"(width) // %3 3168 : 3169 : "cc", "memory", "v0", "v1", "v2", "v3" 3170 ); 3171} 3172#endif // HAS_SOBELROW_NEON 3173 3174// Adds Sobel X and Sobel Y and stores Sobel into plane. 3175#ifdef HAS_SOBELTOPLANEROW_NEON 3176void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3177 uint8* dst_y, int width) { 3178 asm volatile ( 3179 // 16 pixel loop. 3180 ".p2align 2 \n" 3181 "1: \n" 3182 MEMACCESS(0) 3183 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. 3184 MEMACCESS(1) 3185 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. 3186 "subs %3, %3, #16 \n" // 16 processed per loop. 3187 "uqadd v0.16b, v0.16b, v1.16b \n" // add 3188 MEMACCESS(2) 3189 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. 3190 "bgt 1b \n" 3191 : "+r"(src_sobelx), // %0 3192 "+r"(src_sobely), // %1 3193 "+r"(dst_y), // %2 3194 "+r"(width) // %3 3195 : 3196 : "cc", "memory", "v0", "v1" 3197 ); 3198} 3199#endif // HAS_SOBELTOPLANEROW_NEON 3200 3201// Mixes Sobel X, Sobel Y and Sobel into ARGB. 3202// A = 255 3203// R = Sobel X 3204// G = Sobel 3205// B = Sobel Y 3206#ifdef HAS_SOBELXYROW_NEON 3207void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, 3208 uint8* dst_argb, int width) { 3209 asm volatile ( 3210 "movi v3.8b, #255 \n" // alpha 3211 // 8 pixel loop. 3212 ".p2align 2 \n" 3213 "1: \n" 3214 MEMACCESS(0) 3215 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. 3216 MEMACCESS(1) 3217 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. 3218 "subs %3, %3, #8 \n" // 8 processed per loop. 3219 "uqadd v1.8b, v0.8b, v2.8b \n" // add 3220 MEMACCESS(2) 3221 "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 3222 "bgt 1b \n" 3223 : "+r"(src_sobelx), // %0 3224 "+r"(src_sobely), // %1 3225 "+r"(dst_argb), // %2 3226 "+r"(width) // %3 3227 : 3228 : "cc", "memory", "v0", "v1", "v2", "v3" 3229 ); 3230} 3231#endif // HAS_SOBELXYROW_NEON 3232 3233// SobelX as a matrix is 3234// -1 0 1 3235// -2 0 2 3236// -1 0 1 3237#ifdef HAS_SOBELXROW_NEON 3238void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, 3239 const uint8* src_y2, uint8* dst_sobelx, int width) { 3240 asm volatile ( 3241 ".p2align 2 \n" 3242 "1: \n" 3243 MEMACCESS(0) 3244 "ld1 {v0.8b}, [%0],%5 \n" // top 3245 MEMACCESS(0) 3246 "ld1 {v1.8b}, [%0],%6 \n" 3247 "usubl v0.8h, v0.8b, v1.8b \n" 3248 MEMACCESS(1) 3249 "ld1 {v2.8b}, [%1],%5 \n" // center * 2 3250 MEMACCESS(1) 3251 "ld1 {v3.8b}, [%1],%6 \n" 3252 "usubl v1.8h, v2.8b, v3.8b \n" 3253 "add v0.8h, v0.8h, v1.8h \n" 3254 "add v0.8h, v0.8h, v1.8h \n" 3255 MEMACCESS(2) 3256 "ld1 {v2.8b}, [%2],%5 \n" // bottom 3257 MEMACCESS(2) 3258 "ld1 {v3.8b}, [%2],%6 \n" 3259 "subs %4, %4, #8 \n" // 8 pixels 3260 "usubl v1.8h, v2.8b, v3.8b \n" 3261 "add v0.8h, v0.8h, v1.8h \n" 3262 "abs v0.8h, v0.8h \n" 3263 "uqxtn v0.8b, v0.8h \n" 3264 MEMACCESS(3) 3265 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx 3266 "bgt 1b \n" 3267 : "+r"(src_y0), // %0 3268 "+r"(src_y1), // %1 3269 "+r"(src_y2), // %2 3270 "+r"(dst_sobelx), // %3 3271 "+r"(width) // %4 3272 : "r"(2), // %5 3273 "r"(6) // %6 3274 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3275 ); 3276} 3277#endif // HAS_SOBELXROW_NEON 3278 3279// SobelY as a matrix is 3280// -1 -2 -1 3281// 0 0 0 3282// 1 2 1 3283#ifdef HAS_SOBELYROW_NEON 3284void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, 3285 uint8* dst_sobely, int width) { 3286 asm volatile ( 3287 ".p2align 2 \n" 3288 "1: \n" 3289 MEMACCESS(0) 3290 "ld1 {v0.8b}, [%0],%4 \n" // left 3291 MEMACCESS(1) 3292 "ld1 {v1.8b}, [%1],%4 \n" 3293 "usubl v0.8h, v0.8b, v1.8b \n" 3294 MEMACCESS(0) 3295 "ld1 {v2.8b}, [%0],%4 \n" // center * 2 3296 MEMACCESS(1) 3297 "ld1 {v3.8b}, [%1],%4 \n" 3298 "usubl v1.8h, v2.8b, v3.8b \n" 3299 "add v0.8h, v0.8h, v1.8h \n" 3300 "add v0.8h, v0.8h, v1.8h \n" 3301 MEMACCESS(0) 3302 "ld1 {v2.8b}, [%0],%5 \n" // right 3303 MEMACCESS(1) 3304 "ld1 {v3.8b}, [%1],%5 \n" 3305 "subs %3, %3, #8 \n" // 8 pixels 3306 "usubl v1.8h, v2.8b, v3.8b \n" 3307 "add v0.8h, v0.8h, v1.8h \n" 3308 "abs v0.8h, v0.8h \n" 3309 "uqxtn v0.8b, v0.8h \n" 3310 MEMACCESS(2) 3311 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely 3312 "bgt 1b \n" 3313 : "+r"(src_y0), // %0 3314 "+r"(src_y1), // %1 3315 "+r"(dst_sobely), // %2 3316 "+r"(width) // %3 3317 : "r"(1), // %4 3318 "r"(6) // %5 3319 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 3320 ); 3321} 3322#endif // HAS_SOBELYROW_NEON 3323#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 3324 3325#ifdef __cplusplus 3326} // extern "C" 3327} // namespace libyuv 3328#endif 3329