1/* 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC Neon armv8 64 bit. 19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 20 21// Read 8 Y, 4 U and 4 V from 422 22#define READYUV422 \ 23 MEMACCESS(0) \ 24 "ld1 {v0.8b}, [%0], #8 \n" \ 25 MEMACCESS(1) \ 26 "ld1 {v1.s}[0], [%1], #4 \n" \ 27 MEMACCESS(2) \ 28 "ld1 {v1.s}[1], [%2], #4 \n" 29 30// Read 8 Y, 2 U and 2 V from 422 31#define READYUV411 \ 32 MEMACCESS(0) \ 33 "ld1 {v0.8b}, [%0], #8 \n" \ 34 MEMACCESS(1) \ 35 "ld1 {v2.h}[0], [%1], #2 \n" \ 36 MEMACCESS(2) \ 37 "ld1 {v2.h}[1], [%2], #2 \n" \ 38 "zip1 v1.8b, v2.8b, v2.8b \n" 39 40// Read 8 Y, 8 U and 8 V from 444 41#define READYUV444 \ 42 MEMACCESS(0) \ 43 "ld1 {v0.8b}, [%0], #8 \n" \ 44 MEMACCESS(1) \ 45 "ld1 {v1.d}[0], [%1], #8 \n" \ 46 MEMACCESS(2) \ 47 "ld1 {v1.d}[1], [%2], #8 \n" \ 48 "uaddlp v1.8h, v1.16b \n" \ 49 "rshrn v1.8b, v1.8h, #1 \n" 50 51// Read 8 Y, and set 4 U and 4 V to 128 52#define READYUV400 \ 53 MEMACCESS(0) \ 54 "ld1 {v0.8b}, [%0], #8 \n" \ 55 "movi v1.8b , #128 \n" 56 57// Read 8 Y and 4 UV from NV12 58#define READNV12 \ 59 MEMACCESS(0) \ 60 "ld1 {v0.8b}, [%0], #8 \n" \ 61 MEMACCESS(1) \ 62 "ld1 {v2.8b}, [%1], #8 \n" \ 63 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 64 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 65 "ins v1.s[1], v3.s[0] \n" 66 67// Read 8 Y and 4 VU from NV21 68#define READNV21 \ 69 MEMACCESS(0) \ 70 "ld1 {v0.8b}, [%0], #8 \n" \ 71 MEMACCESS(1) \ 72 "ld1 {v2.8b}, [%1], #8 \n" \ 73 "uzp1 v3.8b, v2.8b, v2.8b \n" \ 74 "uzp2 v1.8b, v2.8b, v2.8b \n" \ 75 "ins 
v1.s[1], v3.s[0] \n" 76 77// Read 8 YUY2 78#define READYUY2 \ 79 MEMACCESS(0) \ 80 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ 81 "uzp2 v3.8b, v1.8b, v1.8b \n" \ 82 "uzp1 v1.8b, v1.8b, v1.8b \n" \ 83 "ins v1.s[1], v3.s[0] \n" 84 85// Read 8 UYVY 86#define READUYVY \ 87 MEMACCESS(0) \ 88 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ 89 "orr v0.8b, v3.8b, v3.8b \n" \ 90 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 91 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 92 "ins v1.s[1], v3.s[0] \n" 93 94#define YUV422TORGB_SETUP_REG \ 95 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ 96 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ 97 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ 98 "ld1r {v31.4s}, [%[kYToRgb]] \n" \ 99 "movi v27.8h, #128 \n" \ 100 "movi v28.8h, #102 \n" \ 101 "movi v29.8h, #25 \n" \ 102 "movi v30.8h, #52 \n" 103 104#define YUV422TORGB(vR, vG, vB) \ 105 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ 106 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ 107 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ 108 "ushll v0.4s, v0.4h, #0 \n" \ 109 "mul v3.4s, v3.4s, v31.4s \n" \ 110 "mul v0.4s, v0.4s, v31.4s \n" \ 111 "sqshrun v0.4h, v0.4s, #16 \n" \ 112 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ 113 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ 114 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ 115 "uxtl v2.8h, v2.8b \n" \ 116 "uxtl v1.8h, v1.8b \n" /* Extract U */ \ 117 "mul v3.8h, v1.8h, v27.8h \n" \ 118 "mul v5.8h, v1.8h, v29.8h \n" \ 119 "mul v6.8h, v2.8h, v30.8h \n" \ 120 "mul v7.8h, v2.8h, v28.8h \n" \ 121 "sqadd v6.8h, v6.8h, v5.8h \n" \ 122 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ 123 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ 124 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ 125 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ 126 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ 127 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ 128 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ 129 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ 130 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ 131 132// YUV to RGB 
conversion constants. 133// Y contribution to R,G,B. Scale and bias. 134#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ 135#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ 136 137// U and V contributions to R,G,B. 138#define UB -128 /* -min(128, round(2.018 * 64)) */ 139#define UG 25 /* -round(-0.391 * 64) */ 140#define VG 52 /* -round(-0.813 * 64) */ 141#define VR -102 /* -round(1.596 * 64) */ 142 143// Bias values to subtract 16 from Y and 128 from U and V. 144#define BB (UB * 128 - YGB) 145#define BG (UG * 128 + VG * 128 - YGB) 146#define BR (VR * 128 - YGB) 147 148static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; 149static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; 150 151#undef YG 152#undef YGB 153#undef UB 154#undef UG 155#undef VG 156#undef VR 157#undef BB 158#undef BG 159#undef BR 160 161#define RGBTOUV_SETUP_REG \ 162 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ 163 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ 164 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ 165 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ 166 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ 167 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ 168 169 170#ifdef HAS_I444TOARGBROW_NEON 171void I444ToARGBRow_NEON(const uint8* src_y, 172 const uint8* src_u, 173 const uint8* src_v, 174 uint8* dst_argb, 175 int width) { 176 asm volatile ( 177 YUV422TORGB_SETUP_REG 178 "1: \n" 179 READYUV444 180 YUV422TORGB(v22, v21, v20) 181 "subs %w4, %w4, #8 \n" 182 "movi v23.8b, #255 \n" /* A */ 183 MEMACCESS(3) 184 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 185 "b.gt 1b \n" 186 : "+r"(src_y), // %0 187 "+r"(src_u), // %1 188 "+r"(src_v), // %2 189 "+r"(dst_argb), // %3 190 "+r"(width) // %4 191 : [kUVBiasBGR]"r"(&kUVBiasBGR), 192 [kYToRgb]"r"(&kYToRgb) 193 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 194 "v21", "v22", 
"v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 195 ); 196} 197#endif // HAS_I444TOARGBROW_NEON 198 199#ifdef HAS_I422TOARGBROW_NEON 200void I422ToARGBRow_NEON(const uint8* src_y, 201 const uint8* src_u, 202 const uint8* src_v, 203 uint8* dst_argb, 204 int width) { 205 asm volatile ( 206 YUV422TORGB_SETUP_REG 207 "1: \n" 208 READYUV422 209 YUV422TORGB(v22, v21, v20) 210 "subs %w4, %w4, #8 \n" 211 "movi v23.8b, #255 \n" /* A */ 212 MEMACCESS(3) 213 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 214 "b.gt 1b \n" 215 : "+r"(src_y), // %0 216 "+r"(src_u), // %1 217 "+r"(src_v), // %2 218 "+r"(dst_argb), // %3 219 "+r"(width) // %4 220 : [kUVBiasBGR]"r"(&kUVBiasBGR), 221 [kYToRgb]"r"(&kYToRgb) 222 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 223 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 224 ); 225} 226#endif // HAS_I422TOARGBROW_NEON 227 228#ifdef HAS_I411TOARGBROW_NEON 229void I411ToARGBRow_NEON(const uint8* src_y, 230 const uint8* src_u, 231 const uint8* src_v, 232 uint8* dst_argb, 233 int width) { 234 asm volatile ( 235 YUV422TORGB_SETUP_REG 236 "1: \n" 237 READYUV411 238 YUV422TORGB(v22, v21, v20) 239 "subs %w4, %w4, #8 \n" 240 "movi v23.8b, #255 \n" /* A */ 241 MEMACCESS(3) 242 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 243 "b.gt 1b \n" 244 : "+r"(src_y), // %0 245 "+r"(src_u), // %1 246 "+r"(src_v), // %2 247 "+r"(dst_argb), // %3 248 "+r"(width) // %4 249 : [kUVBiasBGR]"r"(&kUVBiasBGR), 250 [kYToRgb]"r"(&kYToRgb) 251 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 252 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 253 ); 254} 255#endif // HAS_I411TOARGBROW_NEON 256 257#ifdef HAS_I422TOBGRAROW_NEON 258void I422ToBGRARow_NEON(const uint8* src_y, 259 const uint8* src_u, 260 const uint8* src_v, 261 uint8* dst_bgra, 262 int width) { 263 asm volatile ( 264 YUV422TORGB_SETUP_REG 265 "1: \n" 266 READYUV422 267 YUV422TORGB(v21, v22, v23) 268 "subs %w4, 
%w4, #8 \n" 269 "movi v20.8b, #255 \n" /* A */ 270 MEMACCESS(3) 271 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 272 "b.gt 1b \n" 273 : "+r"(src_y), // %0 274 "+r"(src_u), // %1 275 "+r"(src_v), // %2 276 "+r"(dst_bgra), // %3 277 "+r"(width) // %4 278 : [kUVBiasBGR]"r"(&kUVBiasBGR), 279 [kYToRgb]"r"(&kYToRgb) 280 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 281 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 282 ); 283} 284#endif // HAS_I422TOBGRAROW_NEON 285 286#ifdef HAS_I422TOABGRROW_NEON 287void I422ToABGRRow_NEON(const uint8* src_y, 288 const uint8* src_u, 289 const uint8* src_v, 290 uint8* dst_abgr, 291 int width) { 292 asm volatile ( 293 YUV422TORGB_SETUP_REG 294 "1: \n" 295 READYUV422 296 YUV422TORGB(v20, v21, v22) 297 "subs %w4, %w4, #8 \n" 298 "movi v23.8b, #255 \n" /* A */ 299 MEMACCESS(3) 300 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 301 "b.gt 1b \n" 302 : "+r"(src_y), // %0 303 "+r"(src_u), // %1 304 "+r"(src_v), // %2 305 "+r"(dst_abgr), // %3 306 "+r"(width) // %4 307 : [kUVBiasBGR]"r"(&kUVBiasBGR), 308 [kYToRgb]"r"(&kYToRgb) 309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 310 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 311 ); 312} 313#endif // HAS_I422TOABGRROW_NEON 314 315#ifdef HAS_I422TORGBAROW_NEON 316void I422ToRGBARow_NEON(const uint8* src_y, 317 const uint8* src_u, 318 const uint8* src_v, 319 uint8* dst_rgba, 320 int width) { 321 asm volatile ( 322 YUV422TORGB_SETUP_REG 323 "1: \n" 324 READYUV422 325 YUV422TORGB(v23, v22, v21) 326 "subs %w4, %w4, #8 \n" 327 "movi v20.8b, #255 \n" /* A */ 328 MEMACCESS(3) 329 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 330 "b.gt 1b \n" 331 : "+r"(src_y), // %0 332 "+r"(src_u), // %1 333 "+r"(src_v), // %2 334 "+r"(dst_rgba), // %3 335 "+r"(width) // %4 336 : [kUVBiasBGR]"r"(&kUVBiasBGR), 337 [kYToRgb]"r"(&kYToRgb) 338 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 
339 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 340 ); 341} 342#endif // HAS_I422TORGBAROW_NEON 343 344#ifdef HAS_I422TORGB24ROW_NEON 345void I422ToRGB24Row_NEON(const uint8* src_y, 346 const uint8* src_u, 347 const uint8* src_v, 348 uint8* dst_rgb24, 349 int width) { 350 asm volatile ( 351 YUV422TORGB_SETUP_REG 352 "1: \n" 353 READYUV422 354 YUV422TORGB(v22, v21, v20) 355 "subs %w4, %w4, #8 \n" 356 MEMACCESS(3) 357 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 358 "b.gt 1b \n" 359 : "+r"(src_y), // %0 360 "+r"(src_u), // %1 361 "+r"(src_v), // %2 362 "+r"(dst_rgb24), // %3 363 "+r"(width) // %4 364 : [kUVBiasBGR]"r"(&kUVBiasBGR), 365 [kYToRgb]"r"(&kYToRgb) 366 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 367 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 368 ); 369} 370#endif // HAS_I422TORGB24ROW_NEON 371 372#ifdef HAS_I422TORAWROW_NEON 373void I422ToRAWRow_NEON(const uint8* src_y, 374 const uint8* src_u, 375 const uint8* src_v, 376 uint8* dst_raw, 377 int width) { 378 asm volatile ( 379 YUV422TORGB_SETUP_REG 380 "1: \n" 381 READYUV422 382 YUV422TORGB(v20, v21, v22) 383 "subs %w4, %w4, #8 \n" 384 MEMACCESS(3) 385 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 386 "b.gt 1b \n" 387 : "+r"(src_y), // %0 388 "+r"(src_u), // %1 389 "+r"(src_v), // %2 390 "+r"(dst_raw), // %3 391 "+r"(width) // %4 392 : [kUVBiasBGR]"r"(&kUVBiasBGR), 393 [kYToRgb]"r"(&kYToRgb) 394 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 395 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 396 ); 397} 398#endif // HAS_I422TORAWROW_NEON 399 400#define ARGBTORGB565 \ 401 "shll v0.8h, v22.8b, #8 \n" /* R */ \ 402 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 403 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 404 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ 405 "sri v0.8h, v20.8h, #11 \n" /* RGB */ 406 407#ifdef HAS_I422TORGB565ROW_NEON 408void I422ToRGB565Row_NEON(const uint8* src_y, 409 const uint8* src_u, 
410 const uint8* src_v, 411 uint8* dst_rgb565, 412 int width) { 413 asm volatile ( 414 YUV422TORGB_SETUP_REG 415 "1: \n" 416 READYUV422 417 YUV422TORGB(v22, v21, v20) 418 "subs %w4, %w4, #8 \n" 419 ARGBTORGB565 420 MEMACCESS(3) 421 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 422 "b.gt 1b \n" 423 : "+r"(src_y), // %0 424 "+r"(src_u), // %1 425 "+r"(src_v), // %2 426 "+r"(dst_rgb565), // %3 427 "+r"(width) // %4 428 : [kUVBiasBGR]"r"(&kUVBiasBGR), 429 [kYToRgb]"r"(&kYToRgb) 430 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 431 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 432 ); 433} 434#endif // HAS_I422TORGB565ROW_NEON 435 436#define ARGBTOARGB1555 \ 437 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 438 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 439 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 440 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 441 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 442 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 443 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 444 445#ifdef HAS_I422TOARGB1555ROW_NEON 446void I422ToARGB1555Row_NEON(const uint8* src_y, 447 const uint8* src_u, 448 const uint8* src_v, 449 uint8* dst_argb1555, 450 int width) { 451 asm volatile ( 452 YUV422TORGB_SETUP_REG 453 "1: \n" 454 READYUV422 455 YUV422TORGB(v22, v21, v20) 456 "subs %w4, %w4, #8 \n" 457 "movi v23.8b, #255 \n" 458 ARGBTOARGB1555 459 MEMACCESS(3) 460 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
461 "b.gt 1b \n" 462 : "+r"(src_y), // %0 463 "+r"(src_u), // %1 464 "+r"(src_v), // %2 465 "+r"(dst_argb1555), // %3 466 "+r"(width) // %4 467 : [kUVBiasBGR]"r"(&kUVBiasBGR), 468 [kYToRgb]"r"(&kYToRgb) 469 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 470 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 471 ); 472} 473#endif // HAS_I422TOARGB1555ROW_NEON 474 475#define ARGBTOARGB4444 \ 476 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 477 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 478 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 479 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 480 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 481 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 482 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 483 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 484 485#ifdef HAS_I422TOARGB4444ROW_NEON 486void I422ToARGB4444Row_NEON(const uint8* src_y, 487 const uint8* src_u, 488 const uint8* src_v, 489 uint8* dst_argb4444, 490 int width) { 491 asm volatile ( 492 YUV422TORGB_SETUP_REG 493 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 494 "1: \n" 495 READYUV422 496 YUV422TORGB(v22, v21, v20) 497 "subs %w4, %w4, #8 \n" 498 "movi v23.8b, #255 \n" 499 ARGBTOARGB4444 500 MEMACCESS(3) 501 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
502 "b.gt 1b \n" 503 : "+r"(src_y), // %0 504 "+r"(src_u), // %1 505 "+r"(src_v), // %2 506 "+r"(dst_argb4444), // %3 507 "+r"(width) // %4 508 : [kUVBiasBGR]"r"(&kUVBiasBGR), 509 [kYToRgb]"r"(&kYToRgb) 510 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 511 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 512 ); 513} 514#endif // HAS_I422TOARGB4444ROW_NEON 515 516#ifdef HAS_I400TOARGBROW_NEON 517void I400ToARGBRow_NEON(const uint8* src_y, 518 uint8* dst_argb, 519 int width) { 520 int64 width64 = (int64)(width); 521 asm volatile ( 522 YUV422TORGB_SETUP_REG 523 "1: \n" 524 READYUV400 525 YUV422TORGB(v22, v21, v20) 526 "subs %w2, %w2, #8 \n" 527 "movi v23.8b, #255 \n" 528 MEMACCESS(1) 529 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 530 "b.gt 1b \n" 531 : "+r"(src_y), // %0 532 "+r"(dst_argb), // %1 533 "+r"(width64) // %2 534 : [kUVBiasBGR]"r"(&kUVBiasBGR), 535 [kYToRgb]"r"(&kYToRgb) 536 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 537 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 538 ); 539} 540#endif // HAS_I400TOARGBROW_NEON 541 542#ifdef HAS_J400TOARGBROW_NEON 543void J400ToARGBRow_NEON(const uint8* src_y, 544 uint8* dst_argb, 545 int width) { 546 asm volatile ( 547 "movi v23.8b, #255 \n" 548 "1: \n" 549 MEMACCESS(0) 550 "ld1 {v20.8b}, [%0], #8 \n" 551 "orr v21.8b, v20.8b, v20.8b \n" 552 "orr v22.8b, v20.8b, v20.8b \n" 553 "subs %w2, %w2, #8 \n" 554 MEMACCESS(1) 555 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 556 "b.gt 1b \n" 557 : "+r"(src_y), // %0 558 "+r"(dst_argb), // %1 559 "+r"(width) // %2 560 : 561 : "cc", "memory", "v20", "v21", "v22", "v23" 562 ); 563} 564#endif // HAS_J400TOARGBROW_NEON 565 566#ifdef HAS_NV12TOARGBROW_NEON 567void NV12ToARGBRow_NEON(const uint8* src_y, 568 const uint8* src_uv, 569 uint8* dst_argb, 570 int width) { 571 asm volatile ( 572 YUV422TORGB_SETUP_REG 573 "1: \n" 574 READNV12 575 YUV422TORGB(v22, v21, v20) 576 "subs %w3, 
%w3, #8 \n" 577 "movi v23.8b, #255 \n" 578 MEMACCESS(2) 579 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 580 "b.gt 1b \n" 581 : "+r"(src_y), // %0 582 "+r"(src_uv), // %1 583 "+r"(dst_argb), // %2 584 "+r"(width) // %3 585 : [kUVBiasBGR]"r"(&kUVBiasBGR), 586 [kYToRgb]"r"(&kYToRgb) 587 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 588 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 589 ); 590} 591#endif // HAS_NV12TOARGBROW_NEON 592 593#ifdef HAS_NV21TOARGBROW_NEON 594void NV21ToARGBRow_NEON(const uint8* src_y, 595 const uint8* src_uv, 596 uint8* dst_argb, 597 int width) { 598 asm volatile ( 599 YUV422TORGB_SETUP_REG 600 "1: \n" 601 READNV21 602 YUV422TORGB(v22, v21, v20) 603 "subs %w3, %w3, #8 \n" 604 "movi v23.8b, #255 \n" 605 MEMACCESS(2) 606 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 607 "b.gt 1b \n" 608 : "+r"(src_y), // %0 609 "+r"(src_uv), // %1 610 "+r"(dst_argb), // %2 611 "+r"(width) // %3 612 : [kUVBiasBGR]"r"(&kUVBiasBGR), 613 [kYToRgb]"r"(&kYToRgb) 614 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 615 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 616 ); 617} 618#endif // HAS_NV21TOARGBROW_NEON 619 620#ifdef HAS_NV12TORGB565ROW_NEON 621void NV12ToRGB565Row_NEON(const uint8* src_y, 622 const uint8* src_uv, 623 uint8* dst_rgb565, 624 int width) { 625 asm volatile ( 626 YUV422TORGB_SETUP_REG 627 "1: \n" 628 READNV12 629 YUV422TORGB(v22, v21, v20) 630 "subs %w3, %w3, #8 \n" 631 ARGBTORGB565 632 MEMACCESS(2) 633 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
634 "b.gt 1b \n" 635 : "+r"(src_y), // %0 636 "+r"(src_uv), // %1 637 "+r"(dst_rgb565), // %2 638 "+r"(width) // %3 639 : [kUVBiasBGR]"r"(&kUVBiasBGR), 640 [kYToRgb]"r"(&kYToRgb) 641 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 642 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 643 ); 644} 645#endif // HAS_NV12TORGB565ROW_NEON 646 647#ifdef HAS_NV21TORGB565ROW_NEON 648void NV21ToRGB565Row_NEON(const uint8* src_y, 649 const uint8* src_uv, 650 uint8* dst_rgb565, 651 int width) { 652 asm volatile ( 653 YUV422TORGB_SETUP_REG 654 "1: \n" 655 READNV21 656 YUV422TORGB(v22, v21, v20) 657 "subs %w3, %w3, #8 \n" 658 ARGBTORGB565 659 MEMACCESS(2) 660 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 661 "b.gt 1b \n" 662 : "+r"(src_y), // %0 663 "+r"(src_uv), // %1 664 "+r"(dst_rgb565), // %2 665 "+r"(width) // %3 666 : [kUVBiasBGR]"r"(&kUVBiasBGR), 667 [kYToRgb]"r"(&kYToRgb) 668 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 669 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 670 ); 671} 672#endif // HAS_NV21TORGB565ROW_NEON 673 674#ifdef HAS_YUY2TOARGBROW_NEON 675void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 676 uint8* dst_argb, 677 int width) { 678 int64 width64 = (int64)(width); 679 asm volatile ( 680 YUV422TORGB_SETUP_REG 681 "1: \n" 682 READYUY2 683 YUV422TORGB(v22, v21, v20) 684 "subs %w2, %w2, #8 \n" 685 "movi v23.8b, #255 \n" 686 MEMACCESS(1) 687 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 688 "b.gt 1b \n" 689 : "+r"(src_yuy2), // %0 690 "+r"(dst_argb), // %1 691 "+r"(width64) // %2 692 : [kUVBiasBGR]"r"(&kUVBiasBGR), 693 [kYToRgb]"r"(&kYToRgb) 694 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 695 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 696 ); 697} 698#endif // HAS_YUY2TOARGBROW_NEON 699 700#ifdef HAS_UYVYTOARGBROW_NEON 701void UYVYToARGBRow_NEON(const uint8* src_uyvy, 702 uint8* dst_argb, 703 int width) { 
704 int64 width64 = (int64)(width); 705 asm volatile ( 706 YUV422TORGB_SETUP_REG 707 "1: \n" 708 READUYVY 709 YUV422TORGB(v22, v21, v20) 710 "subs %w2, %w2, #8 \n" 711 "movi v23.8b, #255 \n" 712 MEMACCESS(1) 713 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 714 "b.gt 1b \n" 715 : "+r"(src_uyvy), // %0 716 "+r"(dst_argb), // %1 717 "+r"(width64) // %2 718 : [kUVBiasBGR]"r"(&kUVBiasBGR), 719 [kYToRgb]"r"(&kYToRgb) 720 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 721 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 722 ); 723} 724#endif // HAS_UYVYTOARGBROW_NEON 725 726// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 727#ifdef HAS_SPLITUVROW_NEON 728void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 729 int width) { 730 asm volatile ( 731 "1: \n" 732 MEMACCESS(0) 733 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 734 "subs %w3, %w3, #16 \n" // 16 processed per loop 735 MEMACCESS(1) 736 "st1 {v0.16b}, [%1], #16 \n" // store U 737 MEMACCESS(2) 738 "st1 {v1.16b}, [%2], #16 \n" // store V 739 "b.gt 1b \n" 740 : "+r"(src_uv), // %0 741 "+r"(dst_u), // %1 742 "+r"(dst_v), // %2 743 "+r"(width) // %3 // Output registers 744 : // Input registers 745 : "cc", "memory", "v0", "v1" // Clobber List 746 ); 747} 748#endif // HAS_SPLITUVROW_NEON 749 750// Reads 16 U's and V's and writes out 16 pairs of UV. 
#ifdef HAS_MERGEUVROW_NEON
// Interleaves one U row and one V row into a UV row, 16 pairs per pass.
// The loop body runs before the count is tested, so at least 16 pairs are
// always processed.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // v0 = 16 U bytes
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // v1 = 16 V bytes
    "subs %w3, %w3, #16 \n"  // width -= 16
    MEMACCESS(2)
    "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // write U0 V0 U1 V1 ...
    "b.gt 1b \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :  // no input-only operands
  : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_MERGEUVROW_NEON

// Copies 32 bytes per pass through four 8-byte vector registers.
#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // read 32 bytes
    "subs %w2, %w2, #32 \n"  // count -= 32
    MEMACCESS(1)
    "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // write 32 bytes
    "b.gt 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :  // no input-only operands
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_COPYROW_NEON

// SetRow writes 'count' bytes using an 8 bit value repeated.
796void SetRow_NEON(uint8* dst, uint8 v8, int count) { 797 asm volatile ( 798 "dup v0.16b, %w2 \n" // duplicate 16 bytes 799 "1: \n" 800 "subs %w1, %w1, #16 \n" // 16 bytes per loop 801 MEMACCESS(0) 802 "st1 {v0.16b}, [%0], #16 \n" // store 803 "b.gt 1b \n" 804 : "+r"(dst), // %0 805 "+r"(count) // %1 806 : "r"(v8) // %2 807 : "cc", "memory", "v0" 808 ); 809} 810 811void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { 812 asm volatile ( 813 "dup v0.4s, %w2 \n" // duplicate 4 ints 814 "1: \n" 815 "subs %w1, %w1, #4 \n" // 4 ints per loop 816 MEMACCESS(0) 817 "st1 {v0.16b}, [%0], #16 \n" // store 818 "b.gt 1b \n" 819 : "+r"(dst), // %0 820 "+r"(count) // %1 821 : "r"(v32) // %2 822 : "cc", "memory", "v0" 823 ); 824} 825 826#ifdef HAS_MIRRORROW_NEON 827void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 828 int64 width64 = (int64) width; 829 asm volatile ( 830 // Start at end of source row. 831 "add %0, %0, %2 \n" 832 "sub %0, %0, #16 \n" 833 834 "1: \n" 835 MEMACCESS(0) 836 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 837 "subs %2, %2, #16 \n" // 16 pixels per loop. 838 "rev64 v0.16b, v0.16b \n" 839 MEMACCESS(1) 840 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 841 MEMACCESS(1) 842 "st1 {v0.D}[0], [%1], #8 \n" 843 "b.gt 1b \n" 844 : "+r"(src), // %0 845 "+r"(dst), // %1 846 "+r"(width64) // %2 847 : "r"((ptrdiff_t)-16) // %3 848 : "cc", "memory", "v0" 849 ); 850} 851#endif // HAS_MIRRORROW_NEON 852 853#ifdef HAS_MIRRORUVROW_NEON 854void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 855 int width) { 856 int64 width64 = (int64) width; 857 asm volatile ( 858 // Start at end of source row. 859 "add %0, %0, %3, lsl #1 \n" 860 "sub %0, %0, #16 \n" 861 862 "1: \n" 863 MEMACCESS(0) 864 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 865 "subs %3, %3, #8 \n" // 8 pixels per loop. 
866 "rev64 v0.8b, v0.8b \n" 867 "rev64 v1.8b, v1.8b \n" 868 MEMACCESS(1) 869 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 870 MEMACCESS(2) 871 "st1 {v1.8b}, [%2], #8 \n" 872 "b.gt 1b \n" 873 : "+r"(src_uv), // %0 874 "+r"(dst_u), // %1 875 "+r"(dst_v), // %2 876 "+r"(width64) // %3 877 : "r"((ptrdiff_t)-16) // %4 878 : "cc", "memory", "v0", "v1" 879 ); 880} 881#endif // HAS_MIRRORUVROW_NEON 882 883#ifdef HAS_ARGBMIRRORROW_NEON 884void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 885 int64 width64 = (int64) width; 886 asm volatile ( 887 // Start at end of source row. 888 "add %0, %0, %2, lsl #2 \n" 889 "sub %0, %0, #16 \n" 890 891 "1: \n" 892 MEMACCESS(0) 893 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 894 "subs %2, %2, #4 \n" // 4 pixels per loop. 895 "rev64 v0.4s, v0.4s \n" 896 MEMACCESS(1) 897 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 898 MEMACCESS(1) 899 "st1 {v0.D}[0], [%1], #8 \n" 900 "b.gt 1b \n" 901 : "+r"(src), // %0 902 "+r"(dst), // %1 903 "+r"(width64) // %2 904 : "r"((ptrdiff_t)-16) // %3 905 : "cc", "memory", "v0" 906 ); 907} 908#endif // HAS_ARGBMIRRORROW_NEON 909 910#ifdef HAS_RGB24TOARGBROW_NEON 911void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { 912 asm volatile ( 913 "movi v4.8b, #255 \n" // Alpha 914 "1: \n" 915 MEMACCESS(0) 916 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 917 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
918 MEMACCESS(1) 919 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 920 "b.gt 1b \n" 921 : "+r"(src_rgb24), // %0 922 "+r"(dst_argb), // %1 923 "+r"(pix) // %2 924 : 925 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 926 ); 927} 928#endif // HAS_RGB24TOARGBROW_NEON 929 930#ifdef HAS_RAWTOARGBROW_NEON 931void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { 932 asm volatile ( 933 "movi v5.8b, #255 \n" // Alpha 934 "1: \n" 935 MEMACCESS(0) 936 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 937 "subs %w2, %w2, #8 \n" // 8 processed per loop. 938 "orr v3.8b, v1.8b, v1.8b \n" // move g 939 "orr v4.8b, v0.8b, v0.8b \n" // move r 940 MEMACCESS(1) 941 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 942 "b.gt 1b \n" 943 : "+r"(src_raw), // %0 944 "+r"(dst_argb), // %1 945 "+r"(pix) // %2 946 : 947 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 948 ); 949} 950#endif // HAS_RAWTOARGBROW_NEON 951 952#define RGB565TOARGB \ 953 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 954 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 955 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 956 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 957 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 958 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 959 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 960 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 961 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 962 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 963 "dup v2.2D, v0.D[1] \n" /* R */ 964 965#ifdef HAS_RGB565TOARGBROW_NEON 966void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { 967 asm volatile ( 968 "movi v3.8b, #255 \n" // Alpha 969 "1: \n" 970 MEMACCESS(0) 971 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 972 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
973 RGB565TOARGB 974 MEMACCESS(1) 975 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 976 "b.gt 1b \n" 977 : "+r"(src_rgb565), // %0 978 "+r"(dst_argb), // %1 979 "+r"(pix) // %2 980 : 981 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 982 ); 983} 984#endif // HAS_RGB565TOARGBROW_NEON 985 986#define ARGB1555TOARGB \ 987 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 988 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 989 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 990 \ 991 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 992 "xtn2 v3.16b, v2.8h \n" \ 993 \ 994 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 995 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 996 \ 997 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ 998 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 999 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 1000 \ 1001 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 1002 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ 1003 "dup v1.2D, v0.D[1] \n" \ 1004 "dup v3.2D, v2.D[1] \n" 1005 1006// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 
// Expands 8 packed 16-bit 555 pixels held in v0 into 8-bit channels:
// B in v0, G in v1, R in v2.  The top bit of each pixel is discarded.
// Each 5-bit value is widened to 8 bits by replicating its upper 3 bits into
// the low bits.  v3 is used as scratch.
#define RGB555TOARGB \
    "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
    "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
    "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
    \
    "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
    "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
    \
    "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
    "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
    "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
    \
    "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
    "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
    "dup v1.2D, v0.D[1] \n" /* G */

#ifdef HAS_ARGB1555TOARGBROW_NEON
// Converts ARGB1555 to ARGB, 8 pixels (16 bytes in, 32 bytes out) per
// iteration.  Loops while the remaining count is > 0 after subtracting 8.
// ARGB1555TOARGB is defined elsewhere in this file.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  asm volatile (
    "movi v3.8b, #255 \n" // Alpha
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb1555), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  );
}
#endif // HAS_ARGB1555TOARGBROW_NEON

// Expands 8 packed ARGB4444 pixels held in v0 into 8-bit channels by
// duplicating each nibble: B in v0, G in v1, R in v2 (low half), A in v3
// (low half).
#define ARGB4444TOARGB \
    "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
    "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
    "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
    "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
    "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
    "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
    "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
    "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
    "dup v0.2D, v2.D[1] \n" \
    "dup v1.2D, v3.D[1] \n"

#ifdef HAS_ARGB4444TOARGBROW_NEON
// Converts ARGB4444 to ARGB, 8 pixels (16 bytes in, 32 bytes out) per
// iteration.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb4444), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  );
}
#endif // HAS_ARGB4444TOARGBROW_NEON

#ifdef HAS_ARGBTORGB24ROW_NEON
// Drops the fourth byte of every 4-byte pixel: 8 pixels are loaded
// de-interleaved into v1..v4 and only v1..v3 are stored back interleaved.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    MEMACCESS(1)
    "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_rgb24), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
  );
}
#endif // HAS_ARGBTORGB24ROW_NEON

#ifdef HAS_ARGBTORAWROW_NEON
// Same as ARGBToRGB24Row_NEON but the three stored channels are written in
// reverse order (r, g, b).  The loaded alpha in v4 is overwritten with g so
// that v3..v5 form a consecutive register list for st3.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "orr v4.8b, v2.8b, v2.8b \n" // mov g
    "orr v5.8b, v1.8b, v1.8b \n" // mov b
    MEMACCESS(1)
    "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_raw), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
  );
}
#endif // HAS_ARGBTORAWROW_NEON

#ifdef HAS_YUY2TOYROW_NEON
// Copies the even bytes (Y) of 32 source bytes to dst_y, 16 pixels per
// iteration.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
    "subs %w2, %w2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
    "b.gt 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1" // Clobber List
  );
}
#endif // HAS_YUY2TOYROW_NEON

#ifdef HAS_UYVYTOYROW_NEON
// Copies the odd bytes (Y) of 32 source bytes to dst_y, 16 pixels per
// iteration.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
    "subs %w2, %w2, #16 \n" // 16 processed per loop.
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
    "b.gt 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1" // Clobber List
  );
}
#endif // HAS_UYVYTOYROW_NEON

#ifdef HAS_YUY2TOUV422ROW_NEON
// De-interleaves 32 bytes four ways and stores bytes 1 (U) and 3 (V) of each
// 4-byte group: 8 U and 8 V per 16 pixels.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
    "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
    MEMACCESS(2)
    "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
    "b.gt 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  );
}
#endif // HAS_YUY2TOUV422ROW_NEON

#ifdef HAS_UYVYTOUV422ROW_NEON
// De-interleaves 32 bytes four ways and stores bytes 0 (U) and 2 (V) of each
// 4-byte group: 8 U and 8 V per 16 pixels.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
    "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
    MEMACCESS(2)
    "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
    "b.gt 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
  );
}
#endif // HAS_UYVYTOUV422ROW_NEON

#ifdef HAS_YUY2TOUVROW_NEON
// Like YUY2ToUV422Row_NEON, but U and V are the rounded average (urhadd) of
// this row and the row stride_yuy2 bytes below it.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
    "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
    "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
    "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
    MEMACCESS(3)
    "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
    "b.gt 1b \n"
  : "+r"(src_yuy2), // %0
    "+r"(src_yuy2b), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
    "v5", "v6", "v7" // Clobber List
  );
}
#endif // HAS_YUY2TOUVROW_NEON

#ifdef HAS_UYVYTOUVROW_NEON
// Like UYVYToUV422Row_NEON, but U and V are the rounded average (urhadd) of
// this row and the row stride_uyvy bytes below it.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
    "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
    "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
    "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
    MEMACCESS(3)
    "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
    "b.gt 1b \n"
  : "+r"(src_uyvy), // %0
    "+r"(src_uyvyb), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
    "v5", "v6", "v7" // Clobber List
  );
}
#endif // HAS_UYVYTOUVROW_NEON

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
#ifdef HAS_ARGBSHUFFLEROW_NEON
// Permutes the bytes of each 16-byte group (4 pixels) with a table lookup.
// The 16 index bytes are read once from shuffler before the loop.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n" // shuffler
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
    "subs %w2, %w2, #4 \n" // 4 processed per loop
    "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
    MEMACCESS(1)
    "st1 {v1.16b}, [%1], #16 \n" // store 4.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "r"(shuffler) // %3
  : "cc", "memory", "v0", "v1", "v2" // Clobber List
  );
}
#endif // HAS_ARGBSHUFFLEROW_NEON

#ifdef HAS_I422TOYUY2ROW_NEON
// Interleaves 16 Y, 8 U and 8 V into 32 bytes ordered Y0 U Y1 V.
// ld2 splits the Ys into even (v0) and odd (v1); the odd Ys are moved to v2
// so that v1 can receive U and st4 can interleave v0..v3.
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
    "orr v2.8b, v1.8b, v1.8b \n"
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
    MEMACCESS(2)
    "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
    "subs %w4, %w4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_yuy2), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif // HAS_I422TOYUY2ROW_NEON

#ifdef HAS_I422TOUYVYROW_NEON
// Interleaves 16 Y, 8 U and 8 V into 32 bytes ordered U Y0 V Y1.
// ld2 splits the Ys into even (v1) and odd (v2); the odd Ys are moved to v3
// so that v2 can receive V and st4 can interleave v0..v3.
void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
    "orr v3.8b, v2.8b, v2.8b \n"
    MEMACCESS(1)
    "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
    MEMACCESS(2)
    "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
    "subs %w4, %w4, #16 \n" // 16 pixels
    MEMACCESS(3)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_uyvy), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif // HAS_I422TOUYVYROW_NEON

#ifdef HAS_ARGBTORGB565ROW_NEON
// Packs 8 ARGB pixels (loaded into v20..v23) to 16 bytes of RGB565 per
// iteration.  ARGBTORGB565 is defined elsewhere in this file and leaves its
// result in v0.
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGBTORGB565
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_rgb565), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
  );
}
#endif // HAS_ARGBTORGB565ROW_NEON

#ifdef HAS_ARGBTORGB565DITHERROW_NEON
// Same as ARGBToRGB565Row_NEON, but first adds a dither byte to B, G and R
// with unsigned saturation.  The 4 bytes of dither4 are replicated across v1,
// so pixel i of each group of 8 uses byte (i & 3) of dither4.
//
// src_argb and width are modified by the asm (post-increment load and subs),
// so they must be read-write "+r" operands; GCC does not allow an asm to
// modify input-only operands.  Operands follow the numbering used by the
// other row functions: source %0, destination %1, count %2.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int width) {
  asm volatile (
    "dup v1.4s, %w3 \n" // dither4
  "1: \n"
    MEMACCESS(0)
    "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "uqadd v20.8b, v20.8b, v1.8b \n"
    "uqadd v21.8b, v21.8b, v1.8b \n"
    "uqadd v22.8b, v22.8b, v1.8b \n"
    ARGBTORGB565
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_rgb), // %1
    "+r"(width) // %2
  : "r"(dither4) // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
#endif // HAS_ARGBTORGB565DITHERROW_NEON

#ifdef HAS_ARGBTOARGB1555ROW_NEON
// Packs 8 ARGB pixels (loaded into v20..v23) to 16 bytes of ARGB1555 per
// iteration.  ARGBTOARGB1555 is defined elsewhere in this file.
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int pix) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGBTOARGB1555
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb1555), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
  );
}
#endif // HAS_ARGBTOARGB1555ROW_NEON

#ifdef HAS_ARGBTOARGB4444ROW_NEON
// Packs 8 ARGB pixels (loaded into v20..v23) to 16 bytes of ARGB4444 per
// iteration.  ARGBTOARGB4444 is defined elsewhere in this file; v4 is preset
// to 0x0f in every byte before the loop.
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int pix) {
  asm volatile (
    "movi v4.16b, #0x0f \n" // bits to clear with vbic.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGBTOARGB4444
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb4444), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
  );
}
#endif // HAS_ARGBTOARGB4444ROW_NEON

#ifdef HAS_ARGBTOYROW_NEON
// Computes Y for 8 pixels per iteration as
//   sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16   (saturating add).
// The fourth (alpha) channel is loaded into v3 but immediately overwritten.
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #13 \n" // B * 0.1016 coefficient
    "movi v5.8b, #65 \n" // G * 0.5078 coefficient
    "movi v6.8b, #33 \n" // R * 0.2578 coefficient
    "movi v7.8b, #16 \n" // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "umull v3.8h, v0.8b, v4.8b \n" // B
    "umlal v3.8h, v1.8b, v5.8b \n" // G
    "umlal v3.8h, v2.8b, v6.8b \n" // R
    "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif // HAS_ARGBTOYROW_NEON

#ifdef HAS_ARGBTOYJROW_NEON
// Computes Y for 8 pixels per iteration as
//   sat8(round((15 * B + 75 * G + 38 * R) >> 7)), with no +16 offset.
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #15 \n" // B * 0.11400 coefficient
    "movi v5.8b, #75 \n" // G * 0.58700 coefficient
    "movi v6.8b, #38 \n" // R * 0.29900 coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "umull v3.8h, v0.8b, v4.8b \n" // B
    "umlal v3.8h, v1.8b, v5.8b \n" // G
    "umlal v3.8h, v2.8b, v6.8b \n" // R
    "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
#endif // HAS_ARGBTOYJROW_NEON

// 8x1 pixels.
#ifdef HAS_ARGBTOUV444ROW_NEON
// Computes one U and one V per pixel, 8 pixels per iteration:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// The arithmetic is done in 16-bit lanes; 0x8080 supplies the +128 bias plus
// rounding before the narrowing shift.
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
    "movi v25.8b, #74 \n" // UG -0.5781 coefficient
    "movi v26.8b, #38 \n" // UR -0.2969 coefficient
    "movi v27.8b, #18 \n" // VB -0.1406 coefficient
    "movi v28.8b, #94 \n" // VG -0.7344 coefficient
    "movi v29.16b,#0x80 \n" // 128.5
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
    "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "umull v4.8h, v0.8b, v24.8b \n" // B
    "umlsl v4.8h, v1.8b, v25.8b \n" // G
    "umlsl v4.8h, v2.8b, v26.8b \n" // R
    "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned

    "umull v3.8h, v2.8b, v24.8b \n" // R
    "umlsl v3.8h, v1.8b, v28.8b \n" // G
    "umlsl v3.8h, v0.8b, v27.8b \n" // B
    "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned

    "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V

    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
    "v24", "v25", "v26", "v27", "v28", "v29"
  );
}
#endif // HAS_ARGBTOUV444ROW_NEON

// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#ifdef HAS_ARGBTOUV422ROW_NEON
// Sums each horizontal pair of pixels (uaddlp) and converts the 8 sums to
// 8 U and 8 V.  Coefficients and bias are expected in v20..v25, loaded by
// RGBTOUV_SETUP_REG (defined elsewhere in this file).
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.

    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.

    "subs %w3, %w3, #16 \n" // 16 processed per loop.
    "mul v3.8h, v0.8h, v20.8h \n" // B
    "mls v3.8h, v1.8h, v21.8h \n" // G
    "mls v3.8h, v2.8h, v22.8h \n" // R
    "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned

    "mul v4.8h, v2.8h, v20.8h \n" // R
    "mls v4.8h, v1.8h, v24.8h \n" // G
    "mls v4.8h, v0.8h, v23.8h \n" // B
    "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned

    "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V

    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_ARGBTOUV422ROW_NEON

// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
#ifdef HAS_ARGBTOUV411ROW_NEON
// Sums each run of 4 horizontally adjacent pixels (uaddlp then addp), halves
// the sum with rounding, and converts the 8 results to 8 U and 8 V.
// Coefficients and bias are expected in v20..v25, loaded by
// RGBTOUV_SETUP_REG (defined elsewhere in this file).
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(0)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
    "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.

    "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
    "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
    "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w3, %w3, #32 \n" // 32 processed per loop.
    "mul v3.8h, v0.8h, v20.8h \n" // B
    "mls v3.8h, v1.8h, v21.8h \n" // G
    "mls v3.8h, v2.8h, v22.8h \n" // R
    "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
    "mul v4.8h, v2.8h, v20.8h \n" // R
    "mls v4.8h, v1.8h, v24.8h \n" // G
    "mls v4.8h, v0.8h, v23.8h \n" // B
    "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
    "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_ARGBTOUV411ROW_NEON

// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
// RGBTOUV converts 8 lanes of 16-bit B/G/R (register names passed as QB, QG,
// QR) to 8 U bytes in v0 and 8 V bytes in v1, using the coefficients in
// v20..v24 and the bias in v25.  v3 and v4 are written before all inputs have
// been read, so QB/QG/QR must not be v3 or v4.
#define RGBTOUV(QB, QG, QR) \
    "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
    "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
    "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
    "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
    "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
    "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
    "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
    "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
    "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
    "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */

// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.

#ifdef HAS_ARGBTOUVROW_NEON
// Sums each 2x2 block of pixels from this row and the row src_stride_argb
// bytes below it (uaddlp + uadalp), halves the sum with rounding, and
// converts the 8 results to 8 U and 8 V via RGBTOUV.  Source channel order
// in memory is B, G, R, A.
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.

    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
    "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_argb_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_ARGBTOUVROW_NEON

// TODO(fbarchard): Subsample match C code.
#ifdef HAS_ARGBTOUVJROW_NEON
// Same structure as ARGBToUVRow_NEON, but loads its own coefficient set
// (63, 42, 21, 10, 53) into v20..v24 instead of using RGBTOUV_SETUP_REG.
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb_1 = src_argb + src_stride_argb;
  asm volatile (
    "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
    "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
    "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
    "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
    "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
    "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
    "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(src_argb_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_ARGBTOUVJROW_NEON

#ifdef HAS_BGRATOUVROW_NEON
// As ARGBToUVRow_NEON, for source bytes ordered A, R, G, B: B is taken from
// the 4th loaded plane, G from the 3rd and R from the 2nd.
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
    "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v3.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_bgra), // %0
    "+r"(src_bgra_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_BGRATOUVROW_NEON

#ifdef HAS_ABGRTOUVROW_NEON
// As ARGBToUVRow_NEON, for source bytes ordered R, G, B, A: B is taken from
// the 3rd loaded plane, G from the 2nd and R from the 1st.
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
    "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v3.8h, #1 \n" // 2x average
    "urshr v2.8h, v2.8h, #1 \n"
    "urshr v1.8h, v1.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v2.8h, v1.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_abgr), // %0
    "+r"(src_abgr_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_ABGRTOUVROW_NEON

#ifdef HAS_RGBATOUVROW_NEON
// As ARGBToUVRow_NEON, for source bytes ordered A, B, G, R: B is taken from
// the 2nd loaded plane, G from the 3rd and R from the 4th.
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                      uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
    "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
    "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgba), // %0
    "+r"(src_rgba_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_RGBATOUVROW_NEON

#ifdef HAS_RGB24TOUVROW_NEON
// As ARGBToUVRow_NEON, for 3-byte pixels ordered B, G, R (48 bytes per
// 16 pixels, loaded with ld3).
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                       uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
    "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v0.8h, v0.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v2.8h, v2.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v0.8h, v1.8h, v2.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(src_rgb24_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_RGB24TOUVROW_NEON

#ifdef HAS_RAWTOUVROW_NEON
// As RGB24ToUVRow_NEON, for 3-byte pixels ordered R, G, B: the first loaded
// plane is passed to RGBTOUV as R and the third as B.
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                     uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_raw_1 = src_raw + src_stride_raw;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
    "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
    "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
    "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
    "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
    "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.

    "urshr v2.8h, v2.8h, #1 \n" // 2x average
    "urshr v1.8h, v1.8h, #1 \n"
    "urshr v0.8h, v0.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 pixels wide (x 2 rows) per loop.
    RGBTOUV(v2.8h, v1.8h, v0.8h)
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_raw), // %0
    "+r"(src_raw_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v23", "v24", "v25"
  );
}
#endif // HAS_RAWTOUVROW_NEON

// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#ifdef HAS_RGB565TOUVROW_NEON
// Converts 16x2 RGB565 pixels to 8 U and 8 V per iteration.
// Each row is read as two groups of 8 pixels; every group is expanded with
// RGB565TOARGB (defined elsewhere in this file; read here as leaving B in v0,
// G in v1, R in v2).  Horizontal pairs are summed with uaddlp, and the second
// row is accumulated with uadalp.  The two 4-lane halves are then joined,
// halved with rounding, and run through the U/V arithmetic using the
// coefficients loaded into v22..v27 below.
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                        uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
  asm volatile (
    "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
    "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
    "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
    "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
    "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
    "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
    RGB565TOARGB
    "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    "ins v16.D[1], v17.D[0] \n"
    "ins v18.D[1], v19.D[0] \n"
    "ins v20.D[1], v21.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n" // 2x average
    "urshr v5.8h, v18.8h, #1 \n"
    "urshr v6.8h, v20.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 processed per loop.
    "mul v16.8h, v4.8h, v22.8h \n" // B
    "mls v16.8h, v5.8h, v23.8h \n" // G
    "mls v16.8h, v6.8h, v24.8h \n" // R
    "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
    "mul v17.8h, v6.8h, v22.8h \n" // R
    "mls v17.8h, v5.8h, v26.8h \n" // G
    "mls v17.8h, v4.8h, v25.8h \n" // B
    "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
    "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_rgb565), // %0
    "+r"(src_rgb565_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
    "v25", "v26", "v27"
  );
}
#endif // HAS_RGB565TOUVROW_NEON

// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#ifdef HAS_ARGB1555TOUVROW_NEON
// Converts 16x2 ARGB1555 pixels to 8 U and 8 V per iteration.
// Each group of 8 pixels is expanded with RGB555TOARGB (B in v0, G in v1,
// R in v2; alpha ignored).  Horizontal pairs are summed with uaddlp and the
// second row is accumulated with uadalp.  Coefficients and bias are expected
// in v20..v25, loaded by RGBTOUV_SETUP_REG (defined elsewhere in this file).
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                          uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
    RGB555TOARGB
    "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n" // 2x average
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n" // B
    "mls v2.8h, v5.8h, v21.8h \n" // G
    "mls v2.8h, v6.8h, v22.8h \n" // R
    "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n" // R
    "mls v3.8h, v5.8h, v24.8h \n" // G
    "mls v3.8h, v4.8h, v23.8h \n" // B
    "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb1555), // %0
    "+r"(src_argb1555_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"
  );
}
#endif // HAS_ARGB1555TOUVROW_NEON

// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#ifdef HAS_ARGB4444TOUVROW_NEON
// Reads 16 ARGB4444 pixels (32 bytes) from each of two rows per iteration,
// sums every 2x2 group, and writes 8 U and 8 V bytes.
// src_stride_argb4444 is the byte offset from the first row to the second.
// The U/V coefficients in v20-v25 come from RGBTOUV_SETUP_REG (defined
// earlier in this file).
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                          uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    // Second row: accumulate onto the pair sums of the first row.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v16.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v17.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v18.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"  // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v26.4h, v0.8b \n"  // B 8 bytes -> 4 shorts.
    "uadalp v27.4h, v1.8b \n"  // G 8 bytes -> 4 shorts.
    "uadalp v28.4h, v2.8b \n"  // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into one 8-lane vector per channel.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"

    "urshr v4.8h, v16.8h, #1 \n"  // 2x average
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"

    "subs %w4, %w4, #16 \n"  // 16 processed per loop.
    "mul v2.8h, v4.8h, v20.8h \n"  // B
    "mls v2.8h, v5.8h, v21.8h \n"  // G
    "mls v2.8h, v6.8h, v22.8h \n"  // R
    "add v2.8h, v2.8h, v25.8h \n"  // +128 -> unsigned
    "mul v3.8h, v6.8h, v20.8h \n"  // R
    "mls v3.8h, v5.8h, v24.8h \n"  // G
    "mls v3.8h, v4.8h, v23.8h \n"  // B
    "add v3.8h, v3.8h, v25.8h \n"  // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n"  // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n"  // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb4444),    // %0
    "+r"(src_argb4444_1),  // %1
    "+r"(dst_u),           // %2
    "+r"(dst_v),           // %3
    "+r"(pix)              // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"

  );
}
#endif  // HAS_ARGB4444TOUVROW_NEON

// The *ToYRow functions below all compute, for 8 pixels per iteration,
//   Y = sat_u8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// and differ only in how B, G and R are obtained from the source format.

#ifdef HAS_RGB565TOYROW_NEON
// Converts RGB565 (16 bytes per 8 pixels) to Y.
// Coefficients live in v24-v27; v4 and v6 are listed as clobbers even though
// this body does not name them.
// NOTE(review): presumably RGB565TOARGB uses v4/v6 as scratch - confirm
// against the macro definition.
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    RGB565TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(pix)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}
#endif  // HAS_RGB565TOYROW_NEON

#ifdef HAS_ARGB1555TOYROW_NEON
// Converts ARGB1555 (16 bytes per 8 pixels) to Y.
// NOTE(review): the coefficients are kept in v4-v7 across ARGB1555TOARGB;
// this relies on that macro leaving v4-v7 untouched - confirm against the
// macro definition.
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "umull v3.8h, v0.8b, v4.8b \n"  // B
    "umlal v3.8h, v1.8b, v5.8b \n"  // G
    "umlal v3.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGB1555TOYROW_NEON

#ifdef HAS_ARGB4444TOYROW_NEON
// Converts ARGB4444 (16 bytes per 8 pixels) to Y.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
  asm volatile (
    "movi v24.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v25.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v26.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v27.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "umull v3.8h, v0.8b, v24.8b \n"  // B
    "umlal v3.8h, v1.8b, v25.8b \n"  // G
    "umlal v3.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v3.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(pix)            // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
#endif  // HAS_ARGB4444TOYROW_NEON

#ifdef HAS_BGRATOYROW_NEON
// Converts 8 BGRA pixels (32 bytes) to Y. After the de-interleaving ld4 the
// second, third and fourth bytes of each pixel (v1, v2, v3) are used as
// R, G and B; the first byte (v0) is ignored.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // R
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_BGRATOYROW_NEON

#ifdef HAS_ABGRTOYROW_NEON
// Converts 8 ABGR pixels (32 bytes) to Y. The first three bytes of each
// pixel (v0, v1, v2) are used as R, G and B; the fourth byte is ignored.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_ABGRTOYROW_NEON

#ifdef HAS_RGBATOYROW_NEON
// Converts 8 RGBA pixels (32 bytes) to Y. The second, third and fourth bytes
// of each pixel (v1, v2, v3) are used as B, G and R; the first byte is
// ignored.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v1.8b, v4.8b \n"  // B
    "umlal v16.8h, v2.8b, v5.8b \n"  // G
    "umlal v16.8h, v3.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RGBATOYROW_NEON

#ifdef HAS_RGB24TOYROW_NEON
// Converts 8 RGB24 pixels (24 bytes) to Y. The three bytes of each pixel
// (v0, v1, v2) are used as B, G and R.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // B
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // R
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RGB24TOYROW_NEON

#ifdef HAS_RAWTOYROW_NEON
// Converts 8 RAW pixels (24 bytes) to Y. Same as RGB24ToYRow_NEON with the
// R and B coefficients swapped, so the three bytes of each pixel
// (v0, v1, v2) are weighted as R, G and B.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
  asm volatile (
    "movi v4.8b, #33 \n"  // R * 0.2578 coefficient
    "movi v5.8b, #65 \n"  // G * 0.5078 coefficient
    "movi v6.8b, #13 \n"  // B * 0.1016 coefficient
    "movi v7.8b, #16 \n"  // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v0.8b, v4.8b \n"  // R
    "umlal v16.8h, v1.8b, v5.8b \n"  // G
    "umlal v16.8h, v2.8b, v6.8b \n"  // B
    "sqrshrun v0.8b, v16.8h, #7 \n"  // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(pix)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
#endif  // HAS_RAWTOYROW_NEON

// Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride, 16 bytes
// per iteration, and writes the result to dst_ptr.
// source_y_fraction selects the weight of the second row out of 256:
//   0   -> copy the first row unchanged
//   64  -> 75% first row / 25% second row (two rounding averages)
//   128 -> rounding average of both rows
//   192 -> 25% first row / 75% second row (two rounding averages)
//   else (row0 * (256 - f) + row1 * f + 128) >> 8
#ifdef HAS_INTERPOLATEROW_NEON
void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  asm volatile (
    // Dispatch to a specialised loop for the common fractions.
    "cmp %w4, #0 \n"
    "b.eq 100f \n"
    "cmp %w4, #64 \n"
    "b.eq 75f \n"
    "cmp %w4, #128 \n"
    "b.eq 50f \n"
    "cmp %w4, #192 \n"
    "b.eq 25f \n"

    "dup v5.16b, %w4 \n"  // weight of second row
    "dup v4.16b, %w5 \n"  // weight of first row
    // General purpose row blend.
  "1: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "umull v2.8h, v0.8b, v4.8b \n"
    "umull2 v3.8h, v0.16b, v4.16b \n"
    "umlal v2.8h, v1.8b, v5.8b \n"
    "umlal2 v3.8h, v1.16b, v5.16b \n"
    "rshrn v0.8b, v2.8h, #8 \n"
    "rshrn2 v0.16b, v3.8h, #8 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
  "25: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
  "50: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v1.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 50b \n"
    "b 99f \n"

    // Blend 75 / 25. Same as the 25 / 75 loop with the two rows swapped.
  "75: \n"
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"
    MEMACCESS(2)
    "ld1 {v0.16b}, [%2], #16 \n"
    "subs %w3, %w3, #16 \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100: \n"
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n"
    "subs %w3, %w3, #16 \n"
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"
    "b.gt 100b \n"

  "99: \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "+r"(src_ptr1),     // %2
    "+r"(dst_width),    // %3
    "+r"(y1_fraction),  // %4
    "+r"(y0_fraction)   // %5
  :
  // v2 is written by umull/umlal in the general blend loop, so it must be
  // declared as clobbered together with v3.
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
  );
}
#endif  // HAS_INTERPOLATEROW_NEON

// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// src_argb0 supplies the s values and the alpha sa; src_argb1 supplies the d
// values. The stored alpha is always 255. Rows are processed 8 pixels at a
// time, then the remaining pixels one at a time.
#ifdef HAS_ARGBBLENDROW_NEON
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "subs %w3, %w3, #8 \n"
    "b.lt 89f \n"
    // Blend 8 pixels.
  "8: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v16.8h, v4.8b, v3.8b \n"  // db * a
    "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
    "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb
    "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
    "movi v3.8b, #255 \n"  // a = 255
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.ge 8b \n"

  "89: \n"
    // Undo the bias of 8 and pre-subtract 1 for the single pixel loop.
    "adds %w3, %w3, #8-1 \n"
    "b.lt 99f \n"

    // Blend 1 pixel at a time.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
    MEMACCESS(1)
    "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
    "subs %w3, %w3, #1 \n"  // 1 processed per loop.
    "umull v16.8h, v4.8b, v3.8b \n"  // db * a
    "umull v17.8h, v5.8b, v3.8b \n"  // dg * a
    "umull v18.8h, v6.8b, v3.8b \n"  // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n"  // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n"  // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n"  // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n"  // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n"  // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n"  // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n"  // + sb
    "uqadd v1.8b, v1.8b, v5.8b \n"  // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n"  // + sr
    "movi v3.8b, #255 \n"  // a = 255
    MEMACCESS(2)
    "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
    "b.ge 1b \n"

  "99: \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18"
  );
}
#endif  // HAS_ARGBBLENDROW_NEON

// Attenuate 8 pixels at a time.
// Each of B, G and R is replaced by (channel * alpha + 128) >> 8; alpha is
// stored back unchanged.
#ifdef HAS_ARGBATTENUATEROW_NEON
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // Attenuate 8 pixels.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v3.8b \n"  // b * a
    "umull v5.8h, v1.8b, v3.8b \n"  // g * a
    "umull v6.8h, v2.8b, v3.8b \n"  // r * a
    "uqrshrn v0.8b, v4.8h, #8 \n"  // b >>= 8
    "uqrshrn v1.8b, v5.8h, #8 \n"  // g >>= 8
    "uqrshrn v2.8b, v6.8h, #8 \n"  // r >>= 8
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
#endif  // HAS_ARGBATTENUATEROW_NEON

// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Works in place on dst_argb; alpha is stored back unchanged.
#ifdef HAS_ARGBQUANTIZEROW_NEON
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "dup v4.8h, %w2 \n"
    // sqdmulh doubles the product before taking the high half, so the scale
    // is halved up front to get (value * scale) >> 16.
    "ushr v4.8h, v4.8h, #1 \n"  // scale >>= 1
    "dup v5.8h, %w3 \n"  // interval multiply.
    "dup v6.8h, %w4 \n"  // interval add

    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    // No post-increment here: the store below advances the pointer.
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 pixels of ARGB.
    "subs %w1, %w1, #8 \n"  // 8 processed per loop.
    "uxtl v0.8h, v0.8b \n"  // b (0 .. 255)
    "uxtl v1.8h, v1.8b \n"
    "uxtl v2.8h, v2.8b \n"
    "sqdmulh v0.8h, v0.8h, v4.8h \n"  // b * scale
    "sqdmulh v1.8h, v1.8h, v4.8h \n"  // g
    "sqdmulh v2.8h, v2.8h, v4.8h \n"  // r
    "mul v0.8h, v0.8h, v5.8h \n"  // b * interval_size
    "mul v1.8h, v1.8h, v5.8h \n"  // g
    "mul v2.8h, v2.8h, v5.8h \n"  // r
    "add v0.8h, v0.8h, v6.8h \n"  // b + interval_offset
    "add v1.8h, v1.8h, v6.8h \n"  // g
    "add v2.8h, v2.8h, v6.8h \n"  // r
    "uqxtn v0.8b, v0.8h \n"
    "uqxtn v1.8b, v1.8h \n"
    "uqxtn v2.8b, v2.8h \n"
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
#endif  // HAS_ARGBQUANTIZEROW_NEON

// Shade 8 pixels at a time by specified value.
// value holds one scale byte per channel. Each byte is duplicated into a
// 16 bit lane of v0 and halved, then applied with sqrdmulh.
// NOTE: the by-element form of sqrdmulh with 16 bit lanes can only index
// registers v0 to v15, which is why the scale is kept in v0.
// Rounding in sqrdmulh does +1 to high if high bit of low s16 is set.
#ifdef HAS_ARGBSHADEROW_NEON
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "dup v0.4s, %w3 \n"  // duplicate scale value.
    "zip1 v0.8b, v0.8b, v0.8b \n"  // v0.8b aarrggbb.
    "ushr v0.8h, v0.8h, #1 \n"  // scale / 2.

    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uxtl v4.8h, v4.8b \n"  // b (0 .. 255)
    "uxtl v5.8h, v5.8b \n"
    "uxtl v6.8h, v6.8b \n"
    "uxtl v7.8h, v7.8b \n"
    "sqrdmulh v4.8h, v4.8h, v0.h[0] \n"  // b * scale * 2
    "sqrdmulh v5.8h, v5.8h, v0.h[1] \n"  // g
    "sqrdmulh v6.8h, v6.8h, v0.h[2] \n"  // r
    "sqrdmulh v7.8h, v7.8h, v0.h[3] \n"  // a
    "uqxtn v4.8b, v4.8h \n"
    "uqxtn v5.8b, v5.8h \n"
    "uqxtn v6.8b, v6.8h \n"
    "uqxtn v7.8b, v7.8h \n"
    MEMACCESS(1)
    "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBSHADEROW_NEON

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
// The gray value is written to B, G and R; alpha is stored back unchanged.
#ifdef HAS_ARGBGRAYROW_NEON
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movi v24.8b, #15 \n"  // B * 0.11400 coefficient
    "movi v25.8b, #75 \n"  // G * 0.58700 coefficient
    "movi v26.8b, #38 \n"  // R * 0.29900 coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v24.8b \n"  // B
    "umlal v4.8h, v1.8b, v25.8b \n"  // G
    "umlal v4.8h, v2.8b, v26.8b \n"  // R
    "sqrshrun v0.8b, v4.8h, #7 \n"  // 15 bit to 8 bit B
    "orr v1.8b, v0.8b, v0.8b \n"  // G
    "orr v2.8b, v0.8b, v0.8b \n"  // R
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
  );
}
#endif  // HAS_ARGBGRAYROW_NEON

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Works in place on dst_argb, 8 pixels per iteration. Results saturate to
// 255; alpha is stored back unchanged.

#ifdef HAS_ARGBSEPIAROW_NEON
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi v20.8b, #17 \n"  // BB coefficient
    "movi v21.8b, #68 \n"  // BG coefficient
    "movi v22.8b, #35 \n"  // BR coefficient
    "movi v24.8b, #22 \n"  // GB coefficient
    "movi v25.8b, #88 \n"  // GG coefficient
    "movi v26.8b, #45 \n"  // GR coefficient
    "movi v28.8b, #24 \n"  // RB coefficient
    "movi v29.8b, #98 \n"  // RG coefficient
    "movi v30.8b, #50 \n"  // RR coefficient
  "1: \n"
    MEMACCESS(0)
    // No post-increment here: the store below advances the pointer.
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
    "subs %w1, %w1, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v20.8b \n"  // B to Sepia B
    "umlal v4.8h, v1.8b, v21.8b \n"  // G
    "umlal v4.8h, v2.8b, v22.8b \n"  // R
    "umull v5.8h, v0.8b, v24.8b \n"  // B to Sepia G
    "umlal v5.8h, v1.8b, v25.8b \n"  // G
    "umlal v5.8h, v2.8b, v26.8b \n"  // R
    "umull v6.8h, v0.8b, v28.8b \n"  // B to Sepia R
    "umlal v6.8h, v1.8b, v29.8b \n"  // G
    "umlal v6.8h, v2.8b, v30.8b \n"  // R
    "uqshrn v0.8b, v4.8h, #7 \n"  // 16 bit to 8 bit B
    "uqshrn v1.8b, v5.8h, #7 \n"  // 16 bit to 8 bit G
    "uqshrn v2.8b, v6.8h, #7 \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)      // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}
#endif  // HAS_ARGBSEPIAROW_NEON

// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
#ifdef HAS_ARGBCOLORMATRIXROW_NEON
// Multiplies each pixel by a 4x4 matrix of signed 8 bit coefficients read
// from matrix_argb (16 bytes). Coefficients 0-3 produce B, 4-7 produce G,
// 8-11 produce R and 12-15 produce A; within each group the order is the
// weight for source B, G, R, A. Sums are accumulated with saturation in
// 16 bits, shifted right by 6 and saturated to unsigned 8 bits.
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n"  // load 16 int8 matrix coefficients.
    "sxtl v0.8h, v2.8b \n"  // B,G coefficients s16.
    "sxtl2 v1.8h, v2.16b \n"  // R,A coefficients s16.

  "1: \n"
    MEMACCESS(0)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uxtl v16.8h, v16.8b \n"  // b (0 .. 255) 16 bit
    "uxtl v17.8h, v17.8b \n"  // g
    "uxtl v18.8h, v18.8b \n"  // r
    "uxtl v19.8h, v19.8b \n"  // a
    "mul v22.8h, v16.8h, v0.h[0] \n"  // B = B * Matrix B
    "mul v23.8h, v16.8h, v0.h[4] \n"  // G = B * Matrix G
    "mul v24.8h, v16.8h, v1.h[0] \n"  // R = B * Matrix R
    "mul v25.8h, v16.8h, v1.h[4] \n"  // A = B * Matrix A
    "mul v4.8h, v17.8h, v0.h[1] \n"  // B += G * Matrix B
    "mul v5.8h, v17.8h, v0.h[5] \n"  // G += G * Matrix G
    "mul v6.8h, v17.8h, v1.h[1] \n"  // R += G * Matrix R
    "mul v7.8h, v17.8h, v1.h[5] \n"  // A += G * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v18.8h, v0.h[2] \n"  // B += R * Matrix B
    "mul v5.8h, v18.8h, v0.h[6] \n"  // G += R * Matrix G
    "mul v6.8h, v18.8h, v1.h[2] \n"  // R += R * Matrix R
    "mul v7.8h, v18.8h, v1.h[6] \n"  // A += R * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v19.8h, v0.h[3] \n"  // B += A * Matrix B
    "mul v5.8h, v19.8h, v0.h[7] \n"  // G += A * Matrix G
    "mul v6.8h, v19.8h, v1.h[3] \n"  // R += A * Matrix R
    "mul v7.8h, v19.8h, v1.h[7] \n"  // A += A * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "sqshrun v16.8b, v22.8h, #6 \n"  // 16 bit to 8 bit B
    "sqshrun v17.8b, v23.8h, #6 \n"  // 16 bit to 8 bit G
    "sqshrun v18.8b, v24.8h, #6 \n"  // 16 bit to 8 bit R
    "sqshrun v19.8b, v25.8h, #6 \n"  // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "r"(matrix_argb)  // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_NEON

// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each output channel, alpha included, is (a * b + 128) >> 8.
#ifdef HAS_ARGBMULTIPLYROW_NEON
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v0.8h, v0.8b, v4.8b \n"  // multiply B
    "umull v1.8h, v1.8b, v5.8b \n"  // multiply G
    "umull v2.8h, v2.8b, v6.8b \n"  // multiply R
    "umull v3.8h, v3.8b, v7.8b \n"  // multiply A
    "rshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit B
    "rshrn v1.8b, v1.8h, #8 \n"  // 16 bit to 8 bit G
    "rshrn v2.8b, v2.8h, #8 \n"  // 16 bit to 8 bit R
    "rshrn v3.8b, v3.8h, #8 \n"  // 16 bit to 8 bit A
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_NEON

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, is added with unsigned saturation.
#ifdef HAS_ARGBADDROW_NEON
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v4.8b \n"
    "uqadd v1.8b, v1.8b, v5.8b \n"
    "uqadd v2.8b, v2.8b, v6.8b \n"
    "uqadd v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBADDROW_NEON

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
#ifdef HAS_ARGBSUBTRACTROW_NEON
// Computes src_argb0 - src_argb1 per channel, alpha included, with unsigned
// saturation (results below 0 become 0).
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqsub v0.8b, v0.8b, v4.8b \n"
    "uqsub v1.8b, v1.8b, v5.8b \n"
    "uqsub v2.8b, v2.8b, v6.8b \n"
    "uqsub v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_NEON

// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// The sum saturates at 255. 8 source bytes from each plane produce 8 ARGB
// pixels (32 bytes) per iteration.
#ifdef HAS_SOBELROW_NEON
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v1.8b \n"  // add
    "orr v1.8b, v0.8b, v0.8b \n"  // copy Sobel to G
    "orr v2.8b, v0.8b, v0.8b \n"  // copy Sobel to R
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELROW_NEON

// Adds Sobel X and Sobel Y and stores Sobel into plane.
#ifdef HAS_SOBELTOPLANEROW_NEON
// Writes the saturating sum of src_sobelx and src_sobely to dst_y,
// 16 bytes per iteration.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 16 sobelx.
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // load 16 sobely.
    "subs %w3, %w3, #16 \n"  // 16 processed per loop.
    "uqadd v0.16b, v0.16b, v1.16b \n"  // add
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"  // store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_NEON

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
#ifdef HAS_SOBELXYROW_NEON
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    // Sobel X is loaded straight into the R register and Sobel Y into the B
    // register, so only G needs to be computed.
    "ld1 {v2.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v0.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v1.8b, v0.8b, v2.8b \n"  // add
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}
#endif  // HAS_SOBELXYROW_NEON

// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// For each of the three rows the code takes p[x] - p[x + 2], weights the
// src_y1 row twice, and stores the saturated absolute value of the sum, so
// the sign of the difference does not matter. Each row pointer advances by
// 2 + 6 = 8 bytes per iteration, and each iteration reads 2 bytes beyond
// the 8 pixels it produces.
#ifdef HAS_SOBELXROW_NEON
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%5 \n"  // top
    MEMACCESS(0)
    "ld1 {v1.8b}, [%0],%6 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(1)
    "ld1 {v2.8b}, [%1],%5 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%6 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"
    MEMACCESS(2)
    "ld1 {v2.8b}, [%2],%5 \n"  // bottom
    MEMACCESS(2)
    "ld1 {v3.8b}, [%2],%6 \n"
    "subs %w4, %w4, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(3)
    "st1 {v0.8b}, [%3], #8 \n"  // store 8 sobelx
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  : "r"(2LL),          // %5
    "r"(6LL)           // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELXROW_NEON

// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
// For each of the three columns x, x + 1 and x + 2 the code takes
// src_y0[] - src_y1[], weights the center column twice, and stores the
// saturated absolute value of the sum. Both row pointers advance by
// 1 + 1 + 6 = 8 bytes per iteration, and each iteration reads 2 bytes
// beyond the 8 pixels it produces.
#ifdef HAS_SOBELYROW_NEON
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%4 \n"  // left
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1],%4 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%4 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%4 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%5 \n"  // right
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%5 \n"
    "subs %w3, %w3, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 sobely
    "b.gt 1b \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  : "r"(1LL),          // %4
    "r"(6LL)           // %5
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // HAS_SOBELYROW_NEON
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif