1/* 2 * Copyright 2014 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/row.h" 12 13#ifdef __cplusplus 14namespace libyuv { 15extern "C" { 16#endif 17 18// This module is for GCC Neon armv8 64 bit. 19#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) 20 21// Read 8 Y, 4 U and 4 V from 422 22#define READYUV422 \ 23 MEMACCESS(0) \ 24 "ld1 {v0.8b}, [%0], #8 \n" \ 25 MEMACCESS(1) \ 26 "ld1 {v1.s}[0], [%1], #4 \n" \ 27 MEMACCESS(2) \ 28 "ld1 {v1.s}[1], [%2], #4 \n" 29 30// Read 8 Y, 2 U and 2 V from 422 31#define READYUV411 \ 32 MEMACCESS(0) \ 33 "ld1 {v0.8b}, [%0], #8 \n" \ 34 MEMACCESS(1) \ 35 "ld1 {v2.h}[0], [%1], #2 \n" \ 36 MEMACCESS(2) \ 37 "ld1 {v2.h}[1], [%2], #2 \n" \ 38 "zip1 v1.8b, v2.8b, v2.8b \n" 39 40// Read 8 Y, 8 U and 8 V from 444 41#define READYUV444 \ 42 MEMACCESS(0) \ 43 "ld1 {v0.8b}, [%0], #8 \n" \ 44 MEMACCESS(1) \ 45 "ld1 {v1.d}[0], [%1], #8 \n" \ 46 MEMACCESS(2) \ 47 "ld1 {v1.d}[1], [%2], #8 \n" \ 48 "uaddlp v1.8h, v1.16b \n" \ 49 "rshrn v1.8b, v1.8h, #1 \n" 50 51// Read 8 Y, and set 4 U and 4 V to 128 52#define READYUV400 \ 53 MEMACCESS(0) \ 54 "ld1 {v0.8b}, [%0], #8 \n" \ 55 "movi v1.8b , #128 \n" 56 57// Read 8 Y and 4 UV from NV12 58#define READNV12 \ 59 MEMACCESS(0) \ 60 "ld1 {v0.8b}, [%0], #8 \n" \ 61 MEMACCESS(1) \ 62 "ld1 {v2.8b}, [%1], #8 \n" \ 63 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 64 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 65 "ins v1.s[1], v3.s[0] \n" 66 67// Read 8 Y and 4 VU from NV21 68#define READNV21 \ 69 MEMACCESS(0) \ 70 "ld1 {v0.8b}, [%0], #8 \n" \ 71 MEMACCESS(1) \ 72 "ld1 {v2.8b}, [%1], #8 \n" \ 73 "uzp1 v3.8b, v2.8b, v2.8b \n" \ 74 "uzp2 v1.8b, v2.8b, v2.8b \n" \ 75 "ins 
v1.s[1], v3.s[0] \n" 76 77// Read 8 YUY2 78#define READYUY2 \ 79 MEMACCESS(0) \ 80 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ 81 "uzp2 v3.8b, v1.8b, v1.8b \n" \ 82 "uzp1 v1.8b, v1.8b, v1.8b \n" \ 83 "ins v1.s[1], v3.s[0] \n" 84 85// Read 8 UYVY 86#define READUYVY \ 87 MEMACCESS(0) \ 88 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ 89 "orr v0.8b, v3.8b, v3.8b \n" \ 90 "uzp1 v1.8b, v2.8b, v2.8b \n" \ 91 "uzp2 v3.8b, v2.8b, v2.8b \n" \ 92 "ins v1.s[1], v3.s[0] \n" 93 94#define YUVTORGB_SETUP \ 95 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ 96 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ 97 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ 98 "ld1r {v31.4s}, [%[kYToRgb]] \n" \ 99 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ 100 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" 101 102#define YUVTORGB(vR, vG, vB) \ 103 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ 104 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ 105 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ 106 "ushll v0.4s, v0.4h, #0 \n" \ 107 "mul v3.4s, v3.4s, v31.4s \n" \ 108 "mul v0.4s, v0.4s, v31.4s \n" \ 109 "sqshrun v0.4h, v0.4s, #16 \n" \ 110 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ 111 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ 112 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ 113 "uxtl v2.8h, v2.8b \n" \ 114 "uxtl v1.8h, v1.8b \n" /* Extract U */ \ 115 "mul v3.8h, v1.8h, v27.8h \n" \ 116 "mul v5.8h, v1.8h, v29.8h \n" \ 117 "mul v6.8h, v2.8h, v30.8h \n" \ 118 "mul v7.8h, v2.8h, v28.8h \n" \ 119 "sqadd v6.8h, v6.8h, v5.8h \n" \ 120 "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ 121 "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ 122 "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ 123 "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ 124 "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ 125 "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ 126 "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ 127 "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ 128 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ 129 130void I444ToARGBRow_NEON(const uint8* src_y, 131 
const uint8* src_u, 132 const uint8* src_v, 133 uint8* dst_argb, 134 const struct YuvConstants* yuvconstants, 135 int width) { 136 asm volatile ( 137 YUVTORGB_SETUP 138 "movi v23.8b, #255 \n" /* A */ 139 "1: \n" 140 READYUV444 141 YUVTORGB(v22, v21, v20) 142 "subs %w4, %w4, #8 \n" 143 MEMACCESS(3) 144 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 145 "b.gt 1b \n" 146 : "+r"(src_y), // %0 147 "+r"(src_u), // %1 148 "+r"(src_v), // %2 149 "+r"(dst_argb), // %3 150 "+r"(width) // %4 151 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 152 [kUVToG]"r"(&yuvconstants->kUVToG), 153 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 154 [kYToRgb]"r"(&yuvconstants->kYToRgb) 155 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 156 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 157 ); 158} 159 160void I422ToARGBRow_NEON(const uint8* src_y, 161 const uint8* src_u, 162 const uint8* src_v, 163 uint8* dst_argb, 164 const struct YuvConstants* yuvconstants, 165 int width) { 166 asm volatile ( 167 YUVTORGB_SETUP 168 "movi v23.8b, #255 \n" /* A */ 169 "1: \n" 170 READYUV422 171 YUVTORGB(v22, v21, v20) 172 "subs %w4, %w4, #8 \n" 173 MEMACCESS(3) 174 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 175 "b.gt 1b \n" 176 : "+r"(src_y), // %0 177 "+r"(src_u), // %1 178 "+r"(src_v), // %2 179 "+r"(dst_argb), // %3 180 "+r"(width) // %4 181 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 182 [kUVToG]"r"(&yuvconstants->kUVToG), 183 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 184 [kYToRgb]"r"(&yuvconstants->kYToRgb) 185 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 186 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 187 ); 188} 189 190void I422AlphaToARGBRow_NEON(const uint8* src_y, 191 const uint8* src_u, 192 const uint8* src_v, 193 const uint8* src_a, 194 uint8* dst_argb, 195 const struct YuvConstants* yuvconstants, 196 int width) { 197 asm volatile ( 198 YUVTORGB_SETUP 199 "1: \n" 200 READYUV422 201 YUVTORGB(v22, 
v21, v20) 202 MEMACCESS(3) 203 "ld1 {v23.8b}, [%3], #8 \n" 204 "subs %w5, %w5, #8 \n" 205 MEMACCESS(4) 206 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" 207 "b.gt 1b \n" 208 : "+r"(src_y), // %0 209 "+r"(src_u), // %1 210 "+r"(src_v), // %2 211 "+r"(src_a), // %3 212 "+r"(dst_argb), // %4 213 "+r"(width) // %5 214 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 215 [kUVToG]"r"(&yuvconstants->kUVToG), 216 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 217 [kYToRgb]"r"(&yuvconstants->kYToRgb) 218 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 219 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 220 ); 221} 222 223void I411ToARGBRow_NEON(const uint8* src_y, 224 const uint8* src_u, 225 const uint8* src_v, 226 uint8* dst_argb, 227 const struct YuvConstants* yuvconstants, 228 int width) { 229 asm volatile ( 230 YUVTORGB_SETUP 231 "movi v23.8b, #255 \n" /* A */ 232 "1: \n" 233 READYUV411 234 YUVTORGB(v22, v21, v20) 235 "subs %w4, %w4, #8 \n" 236 MEMACCESS(3) 237 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 238 "b.gt 1b \n" 239 : "+r"(src_y), // %0 240 "+r"(src_u), // %1 241 "+r"(src_v), // %2 242 "+r"(dst_argb), // %3 243 "+r"(width) // %4 244 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 245 [kUVToG]"r"(&yuvconstants->kUVToG), 246 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 247 [kYToRgb]"r"(&yuvconstants->kYToRgb) 248 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 249 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 250 ); 251} 252 253void I422ToRGBARow_NEON(const uint8* src_y, 254 const uint8* src_u, 255 const uint8* src_v, 256 uint8* dst_rgba, 257 const struct YuvConstants* yuvconstants, 258 int width) { 259 asm volatile ( 260 YUVTORGB_SETUP 261 "movi v20.8b, #255 \n" /* A */ 262 "1: \n" 263 READYUV422 264 YUVTORGB(v23, v22, v21) 265 "subs %w4, %w4, #8 \n" 266 MEMACCESS(3) 267 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" 268 "b.gt 1b \n" 269 : "+r"(src_y), // %0 270 "+r"(src_u), // 
%1 271 "+r"(src_v), // %2 272 "+r"(dst_rgba), // %3 273 "+r"(width) // %4 274 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 275 [kUVToG]"r"(&yuvconstants->kUVToG), 276 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 277 [kYToRgb]"r"(&yuvconstants->kYToRgb) 278 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 279 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 280 ); 281} 282 283void I422ToRGB24Row_NEON(const uint8* src_y, 284 const uint8* src_u, 285 const uint8* src_v, 286 uint8* dst_rgb24, 287 const struct YuvConstants* yuvconstants, 288 int width) { 289 asm volatile ( 290 YUVTORGB_SETUP 291 "1: \n" 292 READYUV422 293 YUVTORGB(v22, v21, v20) 294 "subs %w4, %w4, #8 \n" 295 MEMACCESS(3) 296 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" 297 "b.gt 1b \n" 298 : "+r"(src_y), // %0 299 "+r"(src_u), // %1 300 "+r"(src_v), // %2 301 "+r"(dst_rgb24), // %3 302 "+r"(width) // %4 303 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 304 [kUVToG]"r"(&yuvconstants->kUVToG), 305 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 306 [kYToRgb]"r"(&yuvconstants->kYToRgb) 307 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 308 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 309 ); 310} 311 312#define ARGBTORGB565 \ 313 "shll v0.8h, v22.8b, #8 \n" /* R */ \ 314 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 315 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 316 "sri v0.8h, v21.8h, #5 \n" /* RG */ \ 317 "sri v0.8h, v20.8h, #11 \n" /* RGB */ 318 319void I422ToRGB565Row_NEON(const uint8* src_y, 320 const uint8* src_u, 321 const uint8* src_v, 322 uint8* dst_rgb565, 323 const struct YuvConstants* yuvconstants, 324 int width) { 325 asm volatile ( 326 YUVTORGB_SETUP 327 "1: \n" 328 READYUV422 329 YUVTORGB(v22, v21, v20) 330 "subs %w4, %w4, #8 \n" 331 ARGBTORGB565 332 MEMACCESS(3) 333 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
334 "b.gt 1b \n" 335 : "+r"(src_y), // %0 336 "+r"(src_u), // %1 337 "+r"(src_v), // %2 338 "+r"(dst_rgb565), // %3 339 "+r"(width) // %4 340 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 341 [kUVToG]"r"(&yuvconstants->kUVToG), 342 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 343 [kYToRgb]"r"(&yuvconstants->kYToRgb) 344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 345 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 346 ); 347} 348 349#define ARGBTOARGB1555 \ 350 "shll v0.8h, v23.8b, #8 \n" /* A */ \ 351 "shll v22.8h, v22.8b, #8 \n" /* R */ \ 352 "shll v21.8h, v21.8b, #8 \n" /* G */ \ 353 "shll v20.8h, v20.8b, #8 \n" /* B */ \ 354 "sri v0.8h, v22.8h, #1 \n" /* AR */ \ 355 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ 356 "sri v0.8h, v20.8h, #11 \n" /* ARGB */ 357 358void I422ToARGB1555Row_NEON(const uint8* src_y, 359 const uint8* src_u, 360 const uint8* src_v, 361 uint8* dst_argb1555, 362 const struct YuvConstants* yuvconstants, 363 int width) { 364 asm volatile ( 365 YUVTORGB_SETUP 366 "movi v23.8b, #255 \n" 367 "1: \n" 368 READYUV422 369 YUVTORGB(v22, v21, v20) 370 "subs %w4, %w4, #8 \n" 371 ARGBTOARGB1555 372 MEMACCESS(3) 373 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
374 "b.gt 1b \n" 375 : "+r"(src_y), // %0 376 "+r"(src_u), // %1 377 "+r"(src_v), // %2 378 "+r"(dst_argb1555), // %3 379 "+r"(width) // %4 380 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 381 [kUVToG]"r"(&yuvconstants->kUVToG), 382 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 383 [kYToRgb]"r"(&yuvconstants->kYToRgb) 384 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 385 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 386 ); 387} 388 389#define ARGBTOARGB4444 \ 390 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ 391 "ushr v20.8b, v20.8b, #4 \n" /* B */ \ 392 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ 393 "ushr v22.8b, v22.8b, #4 \n" /* R */ \ 394 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ 395 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ 396 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ 397 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ 398 399void I422ToARGB4444Row_NEON(const uint8* src_y, 400 const uint8* src_u, 401 const uint8* src_v, 402 uint8* dst_argb4444, 403 const struct YuvConstants* yuvconstants, 404 int width) { 405 asm volatile ( 406 YUVTORGB_SETUP 407 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 408 "1: \n" 409 READYUV422 410 YUVTORGB(v22, v21, v20) 411 "subs %w4, %w4, #8 \n" 412 "movi v23.8b, #255 \n" 413 ARGBTOARGB4444 414 MEMACCESS(3) 415 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
416 "b.gt 1b \n" 417 : "+r"(src_y), // %0 418 "+r"(src_u), // %1 419 "+r"(src_v), // %2 420 "+r"(dst_argb4444), // %3 421 "+r"(width) // %4 422 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 423 [kUVToG]"r"(&yuvconstants->kUVToG), 424 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 425 [kYToRgb]"r"(&yuvconstants->kYToRgb) 426 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 427 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 428 ); 429} 430 431void I400ToARGBRow_NEON(const uint8* src_y, 432 uint8* dst_argb, 433 int width) { 434 asm volatile ( 435 YUVTORGB_SETUP 436 "movi v23.8b, #255 \n" 437 "1: \n" 438 READYUV400 439 YUVTORGB(v22, v21, v20) 440 "subs %w2, %w2, #8 \n" 441 MEMACCESS(1) 442 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 443 "b.gt 1b \n" 444 : "+r"(src_y), // %0 445 "+r"(dst_argb), // %1 446 "+r"(width) // %2 447 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), 448 [kUVToG]"r"(&kYuvI601Constants.kUVToG), 449 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), 450 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) 451 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 452 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 453 ); 454} 455 456void J400ToARGBRow_NEON(const uint8* src_y, 457 uint8* dst_argb, 458 int width) { 459 asm volatile ( 460 "movi v23.8b, #255 \n" 461 "1: \n" 462 MEMACCESS(0) 463 "ld1 {v20.8b}, [%0], #8 \n" 464 "orr v21.8b, v20.8b, v20.8b \n" 465 "orr v22.8b, v20.8b, v20.8b \n" 466 "subs %w2, %w2, #8 \n" 467 MEMACCESS(1) 468 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 469 "b.gt 1b \n" 470 : "+r"(src_y), // %0 471 "+r"(dst_argb), // %1 472 "+r"(width) // %2 473 : 474 : "cc", "memory", "v20", "v21", "v22", "v23" 475 ); 476} 477 478void NV12ToARGBRow_NEON(const uint8* src_y, 479 const uint8* src_uv, 480 uint8* dst_argb, 481 const struct YuvConstants* yuvconstants, 482 int width) { 483 asm volatile ( 484 YUVTORGB_SETUP 485 "movi v23.8b, #255 \n" 486 "1: \n" 487 READNV12 488 
YUVTORGB(v22, v21, v20) 489 "subs %w3, %w3, #8 \n" 490 MEMACCESS(2) 491 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 492 "b.gt 1b \n" 493 : "+r"(src_y), // %0 494 "+r"(src_uv), // %1 495 "+r"(dst_argb), // %2 496 "+r"(width) // %3 497 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 498 [kUVToG]"r"(&yuvconstants->kUVToG), 499 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 500 [kYToRgb]"r"(&yuvconstants->kYToRgb) 501 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 502 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 503 ); 504} 505 506void NV21ToARGBRow_NEON(const uint8* src_y, 507 const uint8* src_vu, 508 uint8* dst_argb, 509 const struct YuvConstants* yuvconstants, 510 int width) { 511 asm volatile ( 512 YUVTORGB_SETUP 513 "movi v23.8b, #255 \n" 514 "1: \n" 515 READNV21 516 YUVTORGB(v22, v21, v20) 517 "subs %w3, %w3, #8 \n" 518 MEMACCESS(2) 519 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" 520 "b.gt 1b \n" 521 : "+r"(src_y), // %0 522 "+r"(src_vu), // %1 523 "+r"(dst_argb), // %2 524 "+r"(width) // %3 525 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 526 [kUVToG]"r"(&yuvconstants->kUVToG), 527 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 528 [kYToRgb]"r"(&yuvconstants->kYToRgb) 529 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 530 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 531 ); 532} 533 534void NV12ToRGB565Row_NEON(const uint8* src_y, 535 const uint8* src_uv, 536 uint8* dst_rgb565, 537 const struct YuvConstants* yuvconstants, 538 int width) { 539 asm volatile ( 540 YUVTORGB_SETUP 541 "1: \n" 542 READNV12 543 YUVTORGB(v22, v21, v20) 544 "subs %w3, %w3, #8 \n" 545 ARGBTORGB565 546 MEMACCESS(2) 547 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
548 "b.gt 1b \n" 549 : "+r"(src_y), // %0 550 "+r"(src_uv), // %1 551 "+r"(dst_rgb565), // %2 552 "+r"(width) // %3 553 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 554 [kUVToG]"r"(&yuvconstants->kUVToG), 555 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 556 [kYToRgb]"r"(&yuvconstants->kYToRgb) 557 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 558 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 559 ); 560} 561 562void YUY2ToARGBRow_NEON(const uint8* src_yuy2, 563 uint8* dst_argb, 564 const struct YuvConstants* yuvconstants, 565 int width) { 566 asm volatile ( 567 YUVTORGB_SETUP 568 "movi v23.8b, #255 \n" 569 "1: \n" 570 READYUY2 571 YUVTORGB(v22, v21, v20) 572 "subs %w2, %w2, #8 \n" 573 MEMACCESS(1) 574 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" 575 "b.gt 1b \n" 576 : "+r"(src_yuy2), // %0 577 "+r"(dst_argb), // %1 578 "+r"(width) // %2 579 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 580 [kUVToG]"r"(&yuvconstants->kUVToG), 581 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 582 [kYToRgb]"r"(&yuvconstants->kYToRgb) 583 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 584 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 585 ); 586} 587 588void UYVYToARGBRow_NEON(const uint8* src_uyvy, 589 uint8* dst_argb, 590 const struct YuvConstants* yuvconstants, 591 int width) { 592 asm volatile ( 593 YUVTORGB_SETUP 594 "movi v23.8b, #255 \n" 595 "1: \n" 596 READUYVY 597 YUVTORGB(v22, v21, v20) 598 "subs %w2, %w2, #8 \n" 599 MEMACCESS(1) 600 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" 601 "b.gt 1b \n" 602 : "+r"(src_uyvy), // %0 603 "+r"(dst_argb), // %1 604 "+r"(width) // %2 605 : [kUVToRB]"r"(&yuvconstants->kUVToRB), 606 [kUVToG]"r"(&yuvconstants->kUVToG), 607 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), 608 [kYToRgb]"r"(&yuvconstants->kYToRgb) 609 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", 610 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" 
611 ); 612} 613 614// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. 615void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 616 int width) { 617 asm volatile ( 618 "1: \n" 619 MEMACCESS(0) 620 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV 621 "subs %w3, %w3, #16 \n" // 16 processed per loop 622 MEMACCESS(1) 623 "st1 {v0.16b}, [%1], #16 \n" // store U 624 MEMACCESS(2) 625 "st1 {v1.16b}, [%2], #16 \n" // store V 626 "b.gt 1b \n" 627 : "+r"(src_uv), // %0 628 "+r"(dst_u), // %1 629 "+r"(dst_v), // %2 630 "+r"(width) // %3 // Output registers 631 : // Input registers 632 : "cc", "memory", "v0", "v1" // Clobber List 633 ); 634} 635 636// Reads 16 U's and V's and writes out 16 pairs of UV. 637void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 638 int width) { 639 asm volatile ( 640 "1: \n" 641 MEMACCESS(0) 642 "ld1 {v0.16b}, [%0], #16 \n" // load U 643 MEMACCESS(1) 644 "ld1 {v1.16b}, [%1], #16 \n" // load V 645 "subs %w3, %w3, #16 \n" // 16 processed per loop 646 MEMACCESS(2) 647 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV 648 "b.gt 1b \n" 649 : 650 "+r"(src_u), // %0 651 "+r"(src_v), // %1 652 "+r"(dst_uv), // %2 653 "+r"(width) // %3 // Output registers 654 : // Input registers 655 : "cc", "memory", "v0", "v1" // Clobber List 656 ); 657} 658 659// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
660void CopyRow_NEON(const uint8* src, uint8* dst, int count) { 661 asm volatile ( 662 "1: \n" 663 MEMACCESS(0) 664 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 665 "subs %w2, %w2, #32 \n" // 32 processed per loop 666 MEMACCESS(1) 667 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 668 "b.gt 1b \n" 669 : "+r"(src), // %0 670 "+r"(dst), // %1 671 "+r"(count) // %2 // Output registers 672 : // Input registers 673 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 674 ); 675} 676 677// SetRow writes 'count' bytes using an 8 bit value repeated. 678void SetRow_NEON(uint8* dst, uint8 v8, int count) { 679 asm volatile ( 680 "dup v0.16b, %w2 \n" // duplicate 16 bytes 681 "1: \n" 682 "subs %w1, %w1, #16 \n" // 16 bytes per loop 683 MEMACCESS(0) 684 "st1 {v0.16b}, [%0], #16 \n" // store 685 "b.gt 1b \n" 686 : "+r"(dst), // %0 687 "+r"(count) // %1 688 : "r"(v8) // %2 689 : "cc", "memory", "v0" 690 ); 691} 692 693void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { 694 asm volatile ( 695 "dup v0.4s, %w2 \n" // duplicate 4 ints 696 "1: \n" 697 "subs %w1, %w1, #4 \n" // 4 ints per loop 698 MEMACCESS(0) 699 "st1 {v0.16b}, [%0], #16 \n" // store 700 "b.gt 1b \n" 701 : "+r"(dst), // %0 702 "+r"(count) // %1 703 : "r"(v32) // %2 704 : "cc", "memory", "v0" 705 ); 706} 707 708void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { 709 asm volatile ( 710 // Start at end of source row. 711 "add %0, %0, %w2, sxtw \n" 712 "sub %0, %0, #16 \n" 713 "1: \n" 714 MEMACCESS(0) 715 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 716 "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
717 "rev64 v0.16b, v0.16b \n" 718 MEMACCESS(1) 719 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 720 MEMACCESS(1) 721 "st1 {v0.D}[0], [%1], #8 \n" 722 "b.gt 1b \n" 723 : "+r"(src), // %0 724 "+r"(dst), // %1 725 "+r"(width) // %2 726 : "r"((ptrdiff_t)-16) // %3 727 : "cc", "memory", "v0" 728 ); 729} 730 731void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 732 int width) { 733 asm volatile ( 734 // Start at end of source row. 735 "add %0, %0, %w3, sxtw #1 \n" 736 "sub %0, %0, #16 \n" 737 "1: \n" 738 MEMACCESS(0) 739 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 740 "subs %w3, %w3, #8 \n" // 8 pixels per loop. 741 "rev64 v0.8b, v0.8b \n" 742 "rev64 v1.8b, v1.8b \n" 743 MEMACCESS(1) 744 "st1 {v0.8b}, [%1], #8 \n" // dst += 8 745 MEMACCESS(2) 746 "st1 {v1.8b}, [%2], #8 \n" 747 "b.gt 1b \n" 748 : "+r"(src_uv), // %0 749 "+r"(dst_u), // %1 750 "+r"(dst_v), // %2 751 "+r"(width) // %3 752 : "r"((ptrdiff_t)-16) // %4 753 : "cc", "memory", "v0", "v1" 754 ); 755} 756 757void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { 758 asm volatile ( 759 // Start at end of source row. 760 "add %0, %0, %w2, sxtw #2 \n" 761 "sub %0, %0, #16 \n" 762 "1: \n" 763 MEMACCESS(0) 764 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 765 "subs %w2, %w2, #4 \n" // 4 pixels per loop. 766 "rev64 v0.4s, v0.4s \n" 767 MEMACCESS(1) 768 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 769 MEMACCESS(1) 770 "st1 {v0.D}[0], [%1], #8 \n" 771 "b.gt 1b \n" 772 : "+r"(src), // %0 773 "+r"(dst), // %1 774 "+r"(width) // %2 775 : "r"((ptrdiff_t)-16) // %3 776 : "cc", "memory", "v0" 777 ); 778} 779 780void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { 781 asm volatile ( 782 "movi v4.8b, #255 \n" // Alpha 783 "1: \n" 784 MEMACCESS(0) 785 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. 786 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
787 MEMACCESS(1) 788 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels 789 "b.gt 1b \n" 790 : "+r"(src_rgb24), // %0 791 "+r"(dst_argb), // %1 792 "+r"(width) // %2 793 : 794 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 795 ); 796} 797 798void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { 799 asm volatile ( 800 "movi v5.8b, #255 \n" // Alpha 801 "1: \n" 802 MEMACCESS(0) 803 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 804 "subs %w2, %w2, #8 \n" // 8 processed per loop. 805 "orr v3.8b, v1.8b, v1.8b \n" // move g 806 "orr v4.8b, v0.8b, v0.8b \n" // move r 807 MEMACCESS(1) 808 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a 809 "b.gt 1b \n" 810 : "+r"(src_raw), // %0 811 "+r"(dst_argb), // %1 812 "+r"(width) // %2 813 : 814 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List 815 ); 816} 817 818void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { 819 asm volatile ( 820 "1: \n" 821 MEMACCESS(0) 822 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b 823 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
824 "orr v3.8b, v1.8b, v1.8b \n" // move g 825 "orr v4.8b, v0.8b, v0.8b \n" // move r 826 MEMACCESS(1) 827 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r 828 "b.gt 1b \n" 829 : "+r"(src_raw), // %0 830 "+r"(dst_rgb24), // %1 831 "+r"(width) // %2 832 : 833 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 834 ); 835} 836 837#define RGB565TOARGB \ 838 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ 839 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ 840 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ 841 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ 842 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 843 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ 844 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ 845 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ 846 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ 847 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ 848 "dup v2.2D, v0.D[1] \n" /* R */ 849 850void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { 851 asm volatile ( 852 "movi v3.8b, #255 \n" // Alpha 853 "1: \n" 854 MEMACCESS(0) 855 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 856 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
857 RGB565TOARGB 858 MEMACCESS(1) 859 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 860 "b.gt 1b \n" 861 : "+r"(src_rgb565), // %0 862 "+r"(dst_argb), // %1 863 "+r"(width) // %2 864 : 865 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List 866 ); 867} 868 869#define ARGB1555TOARGB \ 870 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 871 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 872 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ 873 \ 874 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ 875 "xtn2 v3.16b, v2.8h \n" \ 876 \ 877 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 878 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 879 \ 880 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ 881 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 882 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 883 \ 884 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 885 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ 886 "dup v1.2D, v0.D[1] \n" \ 887 "dup v3.2D, v2.D[1] \n" 888 889// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. 890#define RGB555TOARGB \ 891 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ 892 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ 893 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ 894 \ 895 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ 896 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ 897 \ 898 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ 899 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ 900 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ 901 \ 902 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ 903 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ 904 "dup v1.2D, v0.D[1] \n" /* G */ \ 905 906void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, 907 int width) { 908 asm volatile ( 909 "movi v3.8b, #255 \n" // Alpha 910 "1: \n" 911 MEMACCESS(0) 912 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 913 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
914 ARGB1555TOARGB 915 MEMACCESS(1) 916 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 917 "b.gt 1b \n" 918 : "+r"(src_argb1555), // %0 919 "+r"(dst_argb), // %1 920 "+r"(width) // %2 921 : 922 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 923 ); 924} 925 926#define ARGB4444TOARGB \ 927 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ 928 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ 929 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ 930 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ 931 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ 932 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ 933 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ 934 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ 935 "dup v0.2D, v2.D[1] \n" \ 936 "dup v1.2D, v3.D[1] \n" 937 938void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, 939 int width) { 940 asm volatile ( 941 "1: \n" 942 MEMACCESS(0) 943 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 944 "subs %w2, %w2, #8 \n" // 8 processed per loop. 945 ARGB4444TOARGB 946 MEMACCESS(1) 947 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 948 "b.gt 1b \n" 949 : "+r"(src_argb4444), // %0 950 "+r"(dst_argb), // %1 951 "+r"(width) // %2 952 : 953 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List 954 ); 955} 956 957void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { 958 asm volatile ( 959 "1: \n" 960 MEMACCESS(0) 961 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels 962 "subs %w2, %w2, #8 \n" // 8 processed per loop. 963 MEMACCESS(1) 964 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 
965 "b.gt 1b \n" 966 : "+r"(src_argb), // %0 967 "+r"(dst_rgb24), // %1 968 "+r"(width) // %2 969 : 970 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List 971 ); 972} 973 974void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { 975 asm volatile ( 976 "1: \n" 977 MEMACCESS(0) 978 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a 979 "subs %w2, %w2, #8 \n" // 8 processed per loop. 980 "orr v4.8b, v2.8b, v2.8b \n" // mov g 981 "orr v5.8b, v1.8b, v1.8b \n" // mov b 982 MEMACCESS(1) 983 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b 984 "b.gt 1b \n" 985 : "+r"(src_argb), // %0 986 "+r"(dst_raw), // %1 987 "+r"(width) // %2 988 : 989 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List 990 ); 991} 992 993void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { 994 asm volatile ( 995 "1: \n" 996 MEMACCESS(0) 997 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. 998 "subs %w2, %w2, #16 \n" // 16 processed per loop. 999 MEMACCESS(1) 1000 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. 1001 "b.gt 1b \n" 1002 : "+r"(src_yuy2), // %0 1003 "+r"(dst_y), // %1 1004 "+r"(width) // %2 1005 : 1006 : "cc", "memory", "v0", "v1" // Clobber List 1007 ); 1008} 1009 1010void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { 1011 asm volatile ( 1012 "1: \n" 1013 MEMACCESS(0) 1014 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. 1015 "subs %w2, %w2, #16 \n" // 16 processed per loop. 1016 MEMACCESS(1) 1017 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. 
1018 "b.gt 1b \n" 1019 : "+r"(src_uyvy), // %0 1020 "+r"(dst_y), // %1 1021 "+r"(width) // %2 1022 : 1023 : "cc", "memory", "v0", "v1" // Clobber List 1024 ); 1025} 1026 1027void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, 1028 int width) { 1029 asm volatile ( 1030 "1: \n" 1031 MEMACCESS(0) 1032 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels 1033 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1034 MEMACCESS(1) 1035 "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 1036 MEMACCESS(2) 1037 "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 1038 "b.gt 1b \n" 1039 : "+r"(src_yuy2), // %0 1040 "+r"(dst_u), // %1 1041 "+r"(dst_v), // %2 1042 "+r"(width) // %3 1043 : 1044 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1045 ); 1046} 1047 1048void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, 1049 int width) { 1050 asm volatile ( 1051 "1: \n" 1052 MEMACCESS(0) 1053 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels 1054 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 1055 MEMACCESS(1) 1056 "st1 {v0.8b}, [%1], #8 \n" // store 8 U. 1057 MEMACCESS(2) 1058 "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 1059 "b.gt 1b \n" 1060 : "+r"(src_uyvy), // %0 1061 "+r"(dst_u), // %1 1062 "+r"(dst_v), // %2 1063 "+r"(width) // %3 1064 : 1065 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1066 ); 1067} 1068 1069void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, 1070 uint8* dst_u, uint8* dst_v, int width) { 1071 const uint8* src_yuy2b = src_yuy2 + stride_yuy2; 1072 asm volatile ( 1073 "1: \n" 1074 MEMACCESS(0) 1075 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1076 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1077 MEMACCESS(1) 1078 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1079 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U 1080 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V 1081 MEMACCESS(2) 1082 "st1 {v1.8b}, [%2], #8 \n" // store 8 U. 
1083 MEMACCESS(3) 1084 "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 1085 "b.gt 1b \n" 1086 : "+r"(src_yuy2), // %0 1087 "+r"(src_yuy2b), // %1 1088 "+r"(dst_u), // %2 1089 "+r"(dst_v), // %3 1090 "+r"(width) // %4 1091 : 1092 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1093 "v5", "v6", "v7" // Clobber List 1094 ); 1095} 1096 1097void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, 1098 uint8* dst_u, uint8* dst_v, int width) { 1099 const uint8* src_uyvyb = src_uyvy + stride_uyvy; 1100 asm volatile ( 1101 "1: \n" 1102 MEMACCESS(0) 1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels 1104 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 1105 MEMACCESS(1) 1106 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row 1107 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U 1108 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V 1109 MEMACCESS(2) 1110 "st1 {v0.8b}, [%2], #8 \n" // store 8 U. 1111 MEMACCESS(3) 1112 "st1 {v2.8b}, [%3], #8 \n" // store 8 V. 1113 "b.gt 1b \n" 1114 : "+r"(src_uyvy), // %0 1115 "+r"(src_uyvyb), // %1 1116 "+r"(dst_u), // %2 1117 "+r"(dst_v), // %3 1118 "+r"(width) // %4 1119 : 1120 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1121 "v5", "v6", "v7" // Clobber List 1122 ); 1123} 1124 1125// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1126void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, 1127 const uint8* shuffler, int width) { 1128 asm volatile ( 1129 MEMACCESS(3) 1130 "ld1 {v2.16b}, [%3] \n" // shuffler 1131 "1: \n" 1132 MEMACCESS(0) 1133 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 1134 "subs %w2, %w2, #4 \n" // 4 processed per loop 1135 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels 1136 MEMACCESS(1) 1137 "st1 {v1.16b}, [%1], #16 \n" // store 4. 
1138 "b.gt 1b \n" 1139 : "+r"(src_argb), // %0 1140 "+r"(dst_argb), // %1 1141 "+r"(width) // %2 1142 : "r"(shuffler) // %3 1143 : "cc", "memory", "v0", "v1", "v2" // Clobber List 1144 ); 1145} 1146 1147void I422ToYUY2Row_NEON(const uint8* src_y, 1148 const uint8* src_u, 1149 const uint8* src_v, 1150 uint8* dst_yuy2, int width) { 1151 asm volatile ( 1152 "1: \n" 1153 MEMACCESS(0) 1154 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys 1155 "orr v2.8b, v1.8b, v1.8b \n" 1156 MEMACCESS(1) 1157 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us 1158 MEMACCESS(2) 1159 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs 1160 "subs %w4, %w4, #16 \n" // 16 pixels 1161 MEMACCESS(3) 1162 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1163 "b.gt 1b \n" 1164 : "+r"(src_y), // %0 1165 "+r"(src_u), // %1 1166 "+r"(src_v), // %2 1167 "+r"(dst_yuy2), // %3 1168 "+r"(width) // %4 1169 : 1170 : "cc", "memory", "v0", "v1", "v2", "v3" 1171 ); 1172} 1173 1174void I422ToUYVYRow_NEON(const uint8* src_y, 1175 const uint8* src_u, 1176 const uint8* src_v, 1177 uint8* dst_uyvy, int width) { 1178 asm volatile ( 1179 "1: \n" 1180 MEMACCESS(0) 1181 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys 1182 "orr v3.8b, v2.8b, v2.8b \n" 1183 MEMACCESS(1) 1184 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us 1185 MEMACCESS(2) 1186 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs 1187 "subs %w4, %w4, #16 \n" // 16 pixels 1188 MEMACCESS(3) 1189 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. 1190 "b.gt 1b \n" 1191 : "+r"(src_y), // %0 1192 "+r"(src_u), // %1 1193 "+r"(src_v), // %2 1194 "+r"(dst_uyvy), // %3 1195 "+r"(width) // %4 1196 : 1197 : "cc", "memory", "v0", "v1", "v2", "v3" 1198 ); 1199} 1200 1201void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { 1202 asm volatile ( 1203 "1: \n" 1204 MEMACCESS(0) 1205 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1206 "subs %w2, %w2, #8 \n" // 8 processed per loop. 
1207 ARGBTORGB565 1208 MEMACCESS(1) 1209 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 1210 "b.gt 1b \n" 1211 : "+r"(src_argb), // %0 1212 "+r"(dst_rgb565), // %1 1213 "+r"(width) // %2 1214 : 1215 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1216 ); 1217} 1218 1219void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, 1220 const uint32 dither4, int width) { 1221 asm volatile ( 1222 "dup v1.4s, %w2 \n" // dither4 1223 "1: \n" 1224 MEMACCESS(1) 1225 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels 1226 "subs %w3, %w3, #8 \n" // 8 processed per loop. 1227 "uqadd v20.8b, v20.8b, v1.8b \n" 1228 "uqadd v21.8b, v21.8b, v1.8b \n" 1229 "uqadd v22.8b, v22.8b, v1.8b \n" 1230 ARGBTORGB565 1231 MEMACCESS(0) 1232 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. 1233 "b.gt 1b \n" 1234 : "+r"(dst_rgb) // %0 1235 : "r"(src_argb), // %1 1236 "r"(dither4), // %2 1237 "r"(width) // %3 1238 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" 1239 ); 1240} 1241 1242void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, 1243 int width) { 1244 asm volatile ( 1245 "1: \n" 1246 MEMACCESS(0) 1247 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1248 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1249 ARGBTOARGB1555 1250 MEMACCESS(1) 1251 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. 1252 "b.gt 1b \n" 1253 : "+r"(src_argb), // %0 1254 "+r"(dst_argb1555), // %1 1255 "+r"(width) // %2 1256 : 1257 : "cc", "memory", "v0", "v20", "v21", "v22", "v23" 1258 ); 1259} 1260 1261void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, 1262 int width) { 1263 asm volatile ( 1264 "movi v4.16b, #0x0f \n" // bits to clear with vbic. 1265 "1: \n" 1266 MEMACCESS(0) 1267 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels 1268 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1269 ARGBTOARGB4444 1270 MEMACCESS(1) 1271 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. 
1272 "b.gt 1b \n" 1273 : "+r"(src_argb), // %0 1274 "+r"(dst_argb4444), // %1 1275 "+r"(width) // %2 1276 : 1277 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" 1278 ); 1279} 1280 1281void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1282 asm volatile ( 1283 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1285 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1286 "movi v7.8b, #16 \n" // Add 16 constant 1287 "1: \n" 1288 MEMACCESS(0) 1289 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1290 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1291 "umull v3.8h, v0.8b, v4.8b \n" // B 1292 "umlal v3.8h, v1.8b, v5.8b \n" // G 1293 "umlal v3.8h, v2.8b, v6.8b \n" // R 1294 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1295 "uqadd v0.8b, v0.8b, v7.8b \n" 1296 MEMACCESS(1) 1297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1298 "b.gt 1b \n" 1299 : "+r"(src_argb), // %0 1300 "+r"(dst_y), // %1 1301 "+r"(width) // %2 1302 : 1303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 1304 ); 1305} 1306 1307void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { 1308 asm volatile ( 1309 "1: \n" 1310 MEMACCESS(0) 1311 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels 1312 "subs %w2, %w2, #16 \n" // 16 processed per loop 1313 MEMACCESS(1) 1314 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. 
1315 "b.gt 1b \n" 1316 : "+r"(src_argb), // %0 1317 "+r"(dst_a), // %1 1318 "+r"(width) // %2 1319 : 1320 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List 1321 ); 1322} 1323 1324void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { 1325 asm volatile ( 1326 "movi v4.8b, #15 \n" // B * 0.11400 coefficient 1327 "movi v5.8b, #75 \n" // G * 0.58700 coefficient 1328 "movi v6.8b, #38 \n" // R * 0.29900 coefficient 1329 "1: \n" 1330 MEMACCESS(0) 1331 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1332 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1333 "umull v3.8h, v0.8b, v4.8b \n" // B 1334 "umlal v3.8h, v1.8b, v5.8b \n" // G 1335 "umlal v3.8h, v2.8b, v6.8b \n" // R 1336 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y 1337 MEMACCESS(1) 1338 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 1339 "b.gt 1b \n" 1340 : "+r"(src_argb), // %0 1341 "+r"(dst_y), // %1 1342 "+r"(width) // %2 1343 : 1344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 1345 ); 1346} 1347 1348// 8x1 pixels. 1349void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1350 int width) { 1351 asm volatile ( 1352 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient 1353 "movi v25.8b, #74 \n" // UG -0.5781 coefficient 1354 "movi v26.8b, #38 \n" // UR -0.2969 coefficient 1355 "movi v27.8b, #18 \n" // VB -0.1406 coefficient 1356 "movi v28.8b, #94 \n" // VG -0.7344 coefficient 1357 "movi v29.16b,#0x80 \n" // 128.5 1358 "1: \n" 1359 MEMACCESS(0) 1360 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 1361 "subs %w3, %w3, #8 \n" // 8 processed per loop. 
1362 "umull v4.8h, v0.8b, v24.8b \n" // B 1363 "umlsl v4.8h, v1.8b, v25.8b \n" // G 1364 "umlsl v4.8h, v2.8b, v26.8b \n" // R 1365 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned 1366 1367 "umull v3.8h, v2.8b, v24.8b \n" // R 1368 "umlsl v3.8h, v1.8b, v28.8b \n" // G 1369 "umlsl v3.8h, v0.8b, v27.8b \n" // B 1370 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned 1371 1372 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U 1373 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1374 1375 MEMACCESS(1) 1376 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1377 MEMACCESS(2) 1378 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1379 "b.gt 1b \n" 1380 : "+r"(src_argb), // %0 1381 "+r"(dst_u), // %1 1382 "+r"(dst_v), // %2 1383 "+r"(width) // %3 1384 : 1385 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", 1386 "v24", "v25", "v26", "v27", "v28", "v29" 1387 ); 1388} 1389 1390#define RGBTOUV_SETUP_REG \ 1391 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ 1392 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ 1393 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ 1394 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ 1395 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ 1396 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ 1397 1398// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. 1399void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1400 int width) { 1401 asm volatile ( 1402 RGBTOUV_SETUP_REG 1403 "1: \n" 1404 MEMACCESS(0) 1405 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1406 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1407 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1408 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1409 MEMACCESS(0) 1410 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. 1411 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 
1412 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1413 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1414 1415 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. 1416 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. 1417 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. 1418 1419 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1420 "urshr v1.8h, v1.8h, #1 \n" 1421 "urshr v2.8h, v2.8h, #1 \n" 1422 1423 "subs %w3, %w3, #32 \n" // 32 processed per loop. 1424 "mul v3.8h, v0.8h, v20.8h \n" // B 1425 "mls v3.8h, v1.8h, v21.8h \n" // G 1426 "mls v3.8h, v2.8h, v22.8h \n" // R 1427 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1428 "mul v4.8h, v2.8h, v20.8h \n" // R 1429 "mls v4.8h, v1.8h, v24.8h \n" // G 1430 "mls v4.8h, v0.8h, v23.8h \n" // B 1431 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned 1432 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U 1433 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V 1434 MEMACCESS(1) 1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. 1436 MEMACCESS(2) 1437 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 1438 "b.gt 1b \n" 1439 : "+r"(src_argb), // %0 1440 "+r"(dst_u), // %1 1441 "+r"(dst_v), // %2 1442 "+r"(width) // %3 1443 : 1444 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1445 "v20", "v21", "v22", "v23", "v24", "v25" 1446 ); 1447} 1448 1449// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
1450#define RGBTOUV(QB, QG, QR) \ 1451 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ 1452 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ 1453 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ 1454 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ 1455 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ 1456 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ 1457 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ 1458 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ 1459 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ 1460 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ 1461 1462// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. 1463// TODO(fbarchard): consider ptrdiff_t for all strides. 1464 1465void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, 1466 uint8* dst_u, uint8* dst_v, int width) { 1467 const uint8* src_argb_1 = src_argb + src_stride_argb; 1468 asm volatile ( 1469 RGBTOUV_SETUP_REG 1470 "1: \n" 1471 MEMACCESS(0) 1472 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1473 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1474 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1475 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1476 1477 MEMACCESS(1) 1478 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1479 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1480 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1481 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1482 1483 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1484 "urshr v1.8h, v1.8h, #1 \n" 1485 "urshr v2.8h, v2.8h, #1 \n" 1486 1487 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1488 RGBTOUV(v0.8h, v1.8h, v2.8h) 1489 MEMACCESS(2) 1490 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1491 MEMACCESS(3) 1492 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1493 "b.gt 1b \n" 1494 : "+r"(src_argb), // %0 1495 "+r"(src_argb_1), // %1 1496 "+r"(dst_u), // %2 1497 "+r"(dst_v), // %3 1498 "+r"(width) // %4 1499 : 1500 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1501 "v20", "v21", "v22", "v23", "v24", "v25" 1502 ); 1503} 1504 1505// TODO(fbarchard): Subsample match C code. 1506void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, 1507 uint8* dst_u, uint8* dst_v, int width) { 1508 const uint8* src_argb_1 = src_argb + src_stride_argb; 1509 asm volatile ( 1510 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 1511 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 1512 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 1513 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 1514 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 1515 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1516 "1: \n" 1517 MEMACCESS(0) 1518 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1519 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1520 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1521 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1522 MEMACCESS(1) 1523 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 1524 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1525 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1526 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1527 1528 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1529 "urshr v1.8h, v1.8h, #1 \n" 1530 "urshr v2.8h, v2.8h, #1 \n" 1531 1532 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1533 RGBTOUV(v0.8h, v1.8h, v2.8h) 1534 MEMACCESS(2) 1535 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1536 MEMACCESS(3) 1537 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1538 "b.gt 1b \n" 1539 : "+r"(src_argb), // %0 1540 "+r"(src_argb_1), // %1 1541 "+r"(dst_u), // %2 1542 "+r"(dst_v), // %3 1543 "+r"(width) // %4 1544 : 1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1546 "v20", "v21", "v22", "v23", "v24", "v25" 1547 ); 1548} 1549 1550void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, 1551 uint8* dst_u, uint8* dst_v, int width) { 1552 const uint8* src_bgra_1 = src_bgra + src_stride_bgra; 1553 asm volatile ( 1554 RGBTOUV_SETUP_REG 1555 "1: \n" 1556 MEMACCESS(0) 1557 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1558 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. 1559 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1560 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. 1561 MEMACCESS(1) 1562 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more 1563 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. 1564 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. 1565 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. 1566 1567 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1568 "urshr v1.8h, v3.8h, #1 \n" 1569 "urshr v2.8h, v2.8h, #1 \n" 1570 1571 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1572 RGBTOUV(v0.8h, v1.8h, v2.8h) 1573 MEMACCESS(2) 1574 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1575 MEMACCESS(3) 1576 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1577 "b.gt 1b \n" 1578 : "+r"(src_bgra), // %0 1579 "+r"(src_bgra_1), // %1 1580 "+r"(dst_u), // %2 1581 "+r"(dst_v), // %3 1582 "+r"(width) // %4 1583 : 1584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1585 "v20", "v21", "v22", "v23", "v24", "v25" 1586 ); 1587} 1588 1589void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, 1590 uint8* dst_u, uint8* dst_v, int width) { 1591 const uint8* src_abgr_1 = src_abgr + src_stride_abgr; 1592 asm volatile ( 1593 RGBTOUV_SETUP_REG 1594 "1: \n" 1595 MEMACCESS(0) 1596 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1597 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1598 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1599 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1600 MEMACCESS(1) 1601 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1602 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 1603 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1604 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. 1605 1606 "urshr v0.8h, v3.8h, #1 \n" // 2x average 1607 "urshr v2.8h, v2.8h, #1 \n" 1608 "urshr v1.8h, v1.8h, #1 \n" 1609 1610 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1611 RGBTOUV(v0.8h, v2.8h, v1.8h) 1612 MEMACCESS(2) 1613 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1614 MEMACCESS(3) 1615 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1616 "b.gt 1b \n" 1617 : "+r"(src_abgr), // %0 1618 "+r"(src_abgr_1), // %1 1619 "+r"(dst_u), // %2 1620 "+r"(dst_v), // %3 1621 "+r"(width) // %4 1622 : 1623 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1624 "v20", "v21", "v22", "v23", "v24", "v25" 1625 ); 1626} 1627 1628void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, 1629 uint8* dst_u, uint8* dst_v, int width) { 1630 const uint8* src_rgba_1 = src_rgba + src_stride_rgba; 1631 asm volatile ( 1632 RGBTOUV_SETUP_REG 1633 "1: \n" 1634 MEMACCESS(0) 1635 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. 1636 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. 1637 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. 1638 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. 1639 MEMACCESS(1) 1640 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 1641 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. 1642 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. 1643 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. 1644 1645 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1646 "urshr v1.8h, v1.8h, #1 \n" 1647 "urshr v2.8h, v2.8h, #1 \n" 1648 1649 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1650 RGBTOUV(v0.8h, v1.8h, v2.8h) 1651 MEMACCESS(2) 1652 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1653 MEMACCESS(3) 1654 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1655 "b.gt 1b \n" 1656 : "+r"(src_rgba), // %0 1657 "+r"(src_rgba_1), // %1 1658 "+r"(dst_u), // %2 1659 "+r"(dst_v), // %3 1660 "+r"(width) // %4 1661 : 1662 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1663 "v20", "v21", "v22", "v23", "v24", "v25" 1664 ); 1665} 1666 1667void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, 1668 uint8* dst_u, uint8* dst_v, int width) { 1669 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; 1670 asm volatile ( 1671 RGBTOUV_SETUP_REG 1672 "1: \n" 1673 MEMACCESS(0) 1674 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. 1675 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 1676 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1677 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. 1678 MEMACCESS(1) 1679 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. 1680 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. 1681 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1682 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 1683 1684 "urshr v0.8h, v0.8h, #1 \n" // 2x average 1685 "urshr v1.8h, v1.8h, #1 \n" 1686 "urshr v2.8h, v2.8h, #1 \n" 1687 1688 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1689 RGBTOUV(v0.8h, v1.8h, v2.8h) 1690 MEMACCESS(2) 1691 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1692 MEMACCESS(3) 1693 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
1694 "b.gt 1b \n" 1695 : "+r"(src_rgb24), // %0 1696 "+r"(src_rgb24_1), // %1 1697 "+r"(dst_u), // %2 1698 "+r"(dst_v), // %3 1699 "+r"(width) // %4 1700 : 1701 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1702 "v20", "v21", "v22", "v23", "v24", "v25" 1703 ); 1704} 1705 1706void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, 1707 uint8* dst_u, uint8* dst_v, int width) { 1708 const uint8* src_raw_1 = src_raw + src_stride_raw; 1709 asm volatile ( 1710 RGBTOUV_SETUP_REG 1711 "1: \n" 1712 MEMACCESS(0) 1713 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. 1714 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. 1715 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 1716 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 1717 MEMACCESS(1) 1718 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels 1719 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 1720 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. 1721 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. 1722 1723 "urshr v2.8h, v2.8h, #1 \n" // 2x average 1724 "urshr v1.8h, v1.8h, #1 \n" 1725 "urshr v0.8h, v0.8h, #1 \n" 1726 1727 "subs %w4, %w4, #16 \n" // 32 processed per loop. 1728 RGBTOUV(v2.8h, v1.8h, v0.8h) 1729 MEMACCESS(2) 1730 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1731 MEMACCESS(3) 1732 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1733 "b.gt 1b \n" 1734 : "+r"(src_raw), // %0 1735 "+r"(src_raw_1), // %1 1736 "+r"(dst_u), // %2 1737 "+r"(dst_v), // %3 1738 "+r"(width) // %4 1739 : 1740 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1741 "v20", "v21", "v22", "v23", "v24", "v25" 1742 ); 1743} 1744 1745// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
1746void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, 1747 uint8* dst_u, uint8* dst_v, int width) { 1748 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; 1749 asm volatile ( 1750 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 1751 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 1752 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 1753 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 1754 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 1755 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) 1756 "1: \n" 1757 MEMACCESS(0) 1758 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1759 RGB565TOARGB 1760 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1761 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1762 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1763 MEMACCESS(0) 1764 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. 1765 RGB565TOARGB 1766 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1767 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1768 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1769 1770 MEMACCESS(1) 1771 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. 1772 RGB565TOARGB 1773 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1774 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1775 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1776 MEMACCESS(1) 1777 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. 1778 RGB565TOARGB 1779 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1780 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1781 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1782 1783 "ins v16.D[1], v17.D[0] \n" 1784 "ins v18.D[1], v19.D[0] \n" 1785 "ins v20.D[1], v21.D[0] \n" 1786 1787 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1788 "urshr v5.8h, v18.8h, #1 \n" 1789 "urshr v6.8h, v20.8h, #1 \n" 1790 1791 "subs %w4, %w4, #16 \n" // 16 processed per loop. 
1792 "mul v16.8h, v4.8h, v22.8h \n" // B 1793 "mls v16.8h, v5.8h, v23.8h \n" // G 1794 "mls v16.8h, v6.8h, v24.8h \n" // R 1795 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned 1796 "mul v17.8h, v6.8h, v22.8h \n" // R 1797 "mls v17.8h, v5.8h, v26.8h \n" // G 1798 "mls v17.8h, v4.8h, v25.8h \n" // B 1799 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned 1800 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U 1801 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V 1802 MEMACCESS(2) 1803 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1804 MEMACCESS(3) 1805 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1806 "b.gt 1b \n" 1807 : "+r"(src_rgb565), // %0 1808 "+r"(src_rgb565_1), // %1 1809 "+r"(dst_u), // %2 1810 "+r"(dst_v), // %3 1811 "+r"(width) // %4 1812 : 1813 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 1814 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 1815 "v25", "v26", "v27" 1816 ); 1817} 1818 1819// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 1820void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, 1821 uint8* dst_u, uint8* dst_v, int width) { 1822 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; 1823 asm volatile ( 1824 RGBTOUV_SETUP_REG 1825 "1: \n" 1826 MEMACCESS(0) 1827 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1828 RGB555TOARGB 1829 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1830 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1831 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1832 MEMACCESS(0) 1833 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 1834 RGB555TOARGB 1835 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1836 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1837 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1838 1839 MEMACCESS(1) 1840 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. 1841 RGB555TOARGB 1842 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 
1843 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1844 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1845 MEMACCESS(1) 1846 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. 1847 RGB555TOARGB 1848 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1849 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1850 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1851 1852 "ins v16.D[1], v26.D[0] \n" 1853 "ins v17.D[1], v27.D[0] \n" 1854 "ins v18.D[1], v28.D[0] \n" 1855 1856 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1857 "urshr v5.8h, v17.8h, #1 \n" 1858 "urshr v6.8h, v18.8h, #1 \n" 1859 1860 "subs %w4, %w4, #16 \n" // 16 processed per loop. 1861 "mul v2.8h, v4.8h, v20.8h \n" // B 1862 "mls v2.8h, v5.8h, v21.8h \n" // G 1863 "mls v2.8h, v6.8h, v22.8h \n" // R 1864 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1865 "mul v3.8h, v6.8h, v20.8h \n" // R 1866 "mls v3.8h, v5.8h, v24.8h \n" // G 1867 "mls v3.8h, v4.8h, v23.8h \n" // B 1868 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1869 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1870 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1871 MEMACCESS(2) 1872 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1873 MEMACCESS(3) 1874 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1875 "b.gt 1b \n" 1876 : "+r"(src_argb1555), // %0 1877 "+r"(src_argb1555_1), // %1 1878 "+r"(dst_u), // %2 1879 "+r"(dst_v), // %3 1880 "+r"(width) // %4 1881 : 1882 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1883 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1884 "v26", "v27", "v28" 1885 ); 1886} 1887 1888// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
1889void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, 1890 uint8* dst_u, uint8* dst_v, int width) { 1891 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; 1892 asm volatile ( 1893 RGBTOUV_SETUP_REG 1894 "1: \n" 1895 MEMACCESS(0) 1896 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 1897 ARGB4444TOARGB 1898 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1899 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1900 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1901 MEMACCESS(0) 1902 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. 1903 ARGB4444TOARGB 1904 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1905 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1906 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1907 1908 MEMACCESS(1) 1909 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. 1910 ARGB4444TOARGB 1911 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1912 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1913 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1914 MEMACCESS(1) 1915 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. 1916 ARGB4444TOARGB 1917 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. 1918 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. 1919 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 1920 1921 "ins v16.D[1], v26.D[0] \n" 1922 "ins v17.D[1], v27.D[0] \n" 1923 "ins v18.D[1], v28.D[0] \n" 1924 1925 "urshr v4.8h, v16.8h, #1 \n" // 2x average 1926 "urshr v5.8h, v17.8h, #1 \n" 1927 "urshr v6.8h, v18.8h, #1 \n" 1928 1929 "subs %w4, %w4, #16 \n" // 16 processed per loop. 
1930 "mul v2.8h, v4.8h, v20.8h \n" // B 1931 "mls v2.8h, v5.8h, v21.8h \n" // G 1932 "mls v2.8h, v6.8h, v22.8h \n" // R 1933 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned 1934 "mul v3.8h, v6.8h, v20.8h \n" // R 1935 "mls v3.8h, v5.8h, v24.8h \n" // G 1936 "mls v3.8h, v4.8h, v23.8h \n" // B 1937 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned 1938 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U 1939 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V 1940 MEMACCESS(2) 1941 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. 1942 MEMACCESS(3) 1943 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 1944 "b.gt 1b \n" 1945 : "+r"(src_argb4444), // %0 1946 "+r"(src_argb4444_1), // %1 1947 "+r"(dst_u), // %2 1948 "+r"(dst_v), // %3 1949 "+r"(width) // %4 1950 : 1951 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", 1952 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", 1953 "v26", "v27", "v28" 1954 1955 ); 1956} 1957 1958void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { 1959 asm volatile ( 1960 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 1961 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 1962 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 1963 "movi v27.8b, #16 \n" // Add 16 constant 1964 "1: \n" 1965 MEMACCESS(0) 1966 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. 1967 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1968 RGB565TOARGB 1969 "umull v3.8h, v0.8b, v24.8b \n" // B 1970 "umlal v3.8h, v1.8b, v25.8b \n" // G 1971 "umlal v3.8h, v2.8b, v26.8b \n" // R 1972 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 1973 "uqadd v0.8b, v0.8b, v27.8b \n" 1974 MEMACCESS(1) 1975 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
1976 "b.gt 1b \n" 1977 : "+r"(src_rgb565), // %0 1978 "+r"(dst_y), // %1 1979 "+r"(width) // %2 1980 : 1981 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", 1982 "v24", "v25", "v26", "v27" 1983 ); 1984} 1985 1986void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { 1987 asm volatile ( 1988 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 1989 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 1990 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 1991 "movi v7.8b, #16 \n" // Add 16 constant 1992 "1: \n" 1993 MEMACCESS(0) 1994 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. 1995 "subs %w2, %w2, #8 \n" // 8 processed per loop. 1996 ARGB1555TOARGB 1997 "umull v3.8h, v0.8b, v4.8b \n" // B 1998 "umlal v3.8h, v1.8b, v5.8b \n" // G 1999 "umlal v3.8h, v2.8b, v6.8b \n" // R 2000 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2001 "uqadd v0.8b, v0.8b, v7.8b \n" 2002 MEMACCESS(1) 2003 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2004 "b.gt 1b \n" 2005 : "+r"(src_argb1555), // %0 2006 "+r"(dst_y), // %1 2007 "+r"(width) // %2 2008 : 2009 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" 2010 ); 2011} 2012 2013void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { 2014 asm volatile ( 2015 "movi v24.8b, #13 \n" // B * 0.1016 coefficient 2016 "movi v25.8b, #65 \n" // G * 0.5078 coefficient 2017 "movi v26.8b, #33 \n" // R * 0.2578 coefficient 2018 "movi v27.8b, #16 \n" // Add 16 constant 2019 "1: \n" 2020 MEMACCESS(0) 2021 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. 2022 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2023 ARGB4444TOARGB 2024 "umull v3.8h, v0.8b, v24.8b \n" // B 2025 "umlal v3.8h, v1.8b, v25.8b \n" // G 2026 "umlal v3.8h, v2.8b, v26.8b \n" // R 2027 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y 2028 "uqadd v0.8b, v0.8b, v27.8b \n" 2029 MEMACCESS(1) 2030 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
2031 "b.gt 1b \n" 2032 : "+r"(src_argb4444), // %0 2033 "+r"(dst_y), // %1 2034 "+r"(width) // %2 2035 : 2036 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" 2037 ); 2038} 2039 2040void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { 2041 asm volatile ( 2042 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2043 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2044 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2045 "movi v7.8b, #16 \n" // Add 16 constant 2046 "1: \n" 2047 MEMACCESS(0) 2048 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2049 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2050 "umull v16.8h, v1.8b, v4.8b \n" // R 2051 "umlal v16.8h, v2.8b, v5.8b \n" // G 2052 "umlal v16.8h, v3.8b, v6.8b \n" // B 2053 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2054 "uqadd v0.8b, v0.8b, v7.8b \n" 2055 MEMACCESS(1) 2056 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2057 "b.gt 1b \n" 2058 : "+r"(src_bgra), // %0 2059 "+r"(dst_y), // %1 2060 "+r"(width) // %2 2061 : 2062 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2063 ); 2064} 2065 2066void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { 2067 asm volatile ( 2068 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2069 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2070 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2071 "movi v7.8b, #16 \n" // Add 16 constant 2072 "1: \n" 2073 MEMACCESS(0) 2074 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2075 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2076 "umull v16.8h, v0.8b, v4.8b \n" // R 2077 "umlal v16.8h, v1.8b, v5.8b \n" // G 2078 "umlal v16.8h, v2.8b, v6.8b \n" // B 2079 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2080 "uqadd v0.8b, v0.8b, v7.8b \n" 2081 MEMACCESS(1) 2082 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
2083 "b.gt 1b \n" 2084 : "+r"(src_abgr), // %0 2085 "+r"(dst_y), // %1 2086 "+r"(width) // %2 2087 : 2088 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2089 ); 2090} 2091 2092void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { 2093 asm volatile ( 2094 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2095 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2096 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2097 "movi v7.8b, #16 \n" // Add 16 constant 2098 "1: \n" 2099 MEMACCESS(0) 2100 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. 2101 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2102 "umull v16.8h, v1.8b, v4.8b \n" // B 2103 "umlal v16.8h, v2.8b, v5.8b \n" // G 2104 "umlal v16.8h, v3.8b, v6.8b \n" // R 2105 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2106 "uqadd v0.8b, v0.8b, v7.8b \n" 2107 MEMACCESS(1) 2108 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2109 "b.gt 1b \n" 2110 : "+r"(src_rgba), // %0 2111 "+r"(dst_y), // %1 2112 "+r"(width) // %2 2113 : 2114 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2115 ); 2116} 2117 2118void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { 2119 asm volatile ( 2120 "movi v4.8b, #13 \n" // B * 0.1016 coefficient 2121 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2122 "movi v6.8b, #33 \n" // R * 0.2578 coefficient 2123 "movi v7.8b, #16 \n" // Add 16 constant 2124 "1: \n" 2125 MEMACCESS(0) 2126 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2127 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2128 "umull v16.8h, v0.8b, v4.8b \n" // B 2129 "umlal v16.8h, v1.8b, v5.8b \n" // G 2130 "umlal v16.8h, v2.8b, v6.8b \n" // R 2131 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2132 "uqadd v0.8b, v0.8b, v7.8b \n" 2133 MEMACCESS(1) 2134 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
2135 "b.gt 1b \n" 2136 : "+r"(src_rgb24), // %0 2137 "+r"(dst_y), // %1 2138 "+r"(width) // %2 2139 : 2140 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2141 ); 2142} 2143 2144void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { 2145 asm volatile ( 2146 "movi v4.8b, #33 \n" // R * 0.2578 coefficient 2147 "movi v5.8b, #65 \n" // G * 0.5078 coefficient 2148 "movi v6.8b, #13 \n" // B * 0.1016 coefficient 2149 "movi v7.8b, #16 \n" // Add 16 constant 2150 "1: \n" 2151 MEMACCESS(0) 2152 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. 2153 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2154 "umull v16.8h, v0.8b, v4.8b \n" // B 2155 "umlal v16.8h, v1.8b, v5.8b \n" // G 2156 "umlal v16.8h, v2.8b, v6.8b \n" // R 2157 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y 2158 "uqadd v0.8b, v0.8b, v7.8b \n" 2159 MEMACCESS(1) 2160 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 2161 "b.gt 1b \n" 2162 : "+r"(src_raw), // %0 2163 "+r"(dst_y), // %1 2164 "+r"(width) // %2 2165 : 2166 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" 2167 ); 2168} 2169 2170// Bilinear filter 16x2 -> 16x1 2171void InterpolateRow_NEON(uint8* dst_ptr, 2172 const uint8* src_ptr, ptrdiff_t src_stride, 2173 int dst_width, int source_y_fraction) { 2174 int y1_fraction = source_y_fraction; 2175 int y0_fraction = 256 - y1_fraction; 2176 const uint8* src_ptr1 = src_ptr + src_stride; 2177 asm volatile ( 2178 "cmp %w4, #0 \n" 2179 "b.eq 100f \n" 2180 "cmp %w4, #128 \n" 2181 "b.eq 50f \n" 2182 2183 "dup v5.16b, %w4 \n" 2184 "dup v4.16b, %w5 \n" 2185 // General purpose row blend. 
2186 "1: \n" 2187 MEMACCESS(1) 2188 "ld1 {v0.16b}, [%1], #16 \n" 2189 MEMACCESS(2) 2190 "ld1 {v1.16b}, [%2], #16 \n" 2191 "subs %w3, %w3, #16 \n" 2192 "umull v2.8h, v0.8b, v4.8b \n" 2193 "umull2 v3.8h, v0.16b, v4.16b \n" 2194 "umlal v2.8h, v1.8b, v5.8b \n" 2195 "umlal2 v3.8h, v1.16b, v5.16b \n" 2196 "rshrn v0.8b, v2.8h, #8 \n" 2197 "rshrn2 v0.16b, v3.8h, #8 \n" 2198 MEMACCESS(0) 2199 "st1 {v0.16b}, [%0], #16 \n" 2200 "b.gt 1b \n" 2201 "b 99f \n" 2202 2203 // Blend 50 / 50. 2204 "50: \n" 2205 MEMACCESS(1) 2206 "ld1 {v0.16b}, [%1], #16 \n" 2207 MEMACCESS(2) 2208 "ld1 {v1.16b}, [%2], #16 \n" 2209 "subs %w3, %w3, #16 \n" 2210 "urhadd v0.16b, v0.16b, v1.16b \n" 2211 MEMACCESS(0) 2212 "st1 {v0.16b}, [%0], #16 \n" 2213 "b.gt 50b \n" 2214 "b 99f \n" 2215 2216 // Blend 100 / 0 - Copy row unchanged. 2217 "100: \n" 2218 MEMACCESS(1) 2219 "ld1 {v0.16b}, [%1], #16 \n" 2220 "subs %w3, %w3, #16 \n" 2221 MEMACCESS(0) 2222 "st1 {v0.16b}, [%0], #16 \n" 2223 "b.gt 100b \n" 2224 2225 "99: \n" 2226 : "+r"(dst_ptr), // %0 2227 "+r"(src_ptr), // %1 2228 "+r"(src_ptr1), // %2 2229 "+r"(dst_width), // %3 2230 "+r"(y1_fraction), // %4 2231 "+r"(y0_fraction) // %5 2232 : 2233 : "cc", "memory", "v0", "v1", "v3", "v4", "v5" 2234 ); 2235} 2236 2237// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr 2238void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, 2239 uint8* dst_argb, int width) { 2240 asm volatile ( 2241 "subs %w3, %w3, #8 \n" 2242 "b.lt 89f \n" 2243 // Blend 8 pixels. 2244 "8: \n" 2245 MEMACCESS(0) 2246 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels 2247 MEMACCESS(1) 2248 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels 2249 "subs %w3, %w3, #8 \n" // 8 processed per loop. 
2250 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2251 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2252 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2253 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2254 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2255 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2256 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2257 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2258 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2259 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2260 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2261 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2262 "movi v3.8b, #255 \n" // a = 255 2263 MEMACCESS(2) 2264 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels 2265 "b.ge 8b \n" 2266 2267 "89: \n" 2268 "adds %w3, %w3, #8-1 \n" 2269 "b.lt 99f \n" 2270 2271 // Blend 1 pixels. 2272 "1: \n" 2273 MEMACCESS(0) 2274 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. 2275 MEMACCESS(1) 2276 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. 2277 "subs %w3, %w3, #1 \n" // 1 processed per loop. 2278 "umull v16.8h, v4.8b, v3.8b \n" // db * a 2279 "umull v17.8h, v5.8b, v3.8b \n" // dg * a 2280 "umull v18.8h, v6.8b, v3.8b \n" // dr * a 2281 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 2282 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 2283 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 2284 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) 2285 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) 2286 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) 2287 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb 2288 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg 2289 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr 2290 "movi v3.8b, #255 \n" // a = 255 2291 MEMACCESS(2) 2292 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
2293 "b.ge 1b \n" 2294 2295 "99: \n" 2296 2297 : "+r"(src_argb0), // %0 2298 "+r"(src_argb1), // %1 2299 "+r"(dst_argb), // %2 2300 "+r"(width) // %3 2301 : 2302 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 2303 "v16", "v17", "v18" 2304 ); 2305} 2306 2307// Attenuate 8 pixels at a time. 2308void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2309 asm volatile ( 2310 // Attenuate 8 pixels. 2311 "1: \n" 2312 MEMACCESS(0) 2313 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels 2314 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2315 "umull v4.8h, v0.8b, v3.8b \n" // b * a 2316 "umull v5.8h, v1.8b, v3.8b \n" // g * a 2317 "umull v6.8h, v2.8b, v3.8b \n" // r * a 2318 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 2319 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 2320 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 2321 MEMACCESS(1) 2322 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels 2323 "b.gt 1b \n" 2324 : "+r"(src_argb), // %0 2325 "+r"(dst_argb), // %1 2326 "+r"(width) // %2 2327 : 2328 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2329 ); 2330} 2331 2332// Quantize 8 ARGB pixels (32 bytes). 2333// dst = (dst * scale >> 16) * interval_size + interval_offset; 2334void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, 2335 int interval_offset, int width) { 2336 asm volatile ( 2337 "dup v4.8h, %w2 \n" 2338 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 2339 "dup v5.8h, %w3 \n" // interval multiply. 2340 "dup v6.8h, %w4 \n" // interval add 2341 2342 // 8 pixel loop. 2343 "1: \n" 2344 MEMACCESS(0) 2345 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. 2346 "subs %w1, %w1, #8 \n" // 8 processed per loop. 2347 "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) 2348 "uxtl v1.8h, v1.8b \n" 2349 "uxtl v2.8h, v2.8b \n" 2350 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale 2351 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g 2352 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r 2353 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size 2354 "mul v1.8h, v1.8h, v5.8h \n" // g 2355 "mul v2.8h, v2.8h, v5.8h \n" // r 2356 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset 2357 "add v1.8h, v1.8h, v6.8h \n" // g 2358 "add v2.8h, v2.8h, v6.8h \n" // r 2359 "uqxtn v0.8b, v0.8h \n" 2360 "uqxtn v1.8b, v1.8h \n" 2361 "uqxtn v2.8b, v2.8h \n" 2362 MEMACCESS(0) 2363 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels 2364 "b.gt 1b \n" 2365 : "+r"(dst_argb), // %0 2366 "+r"(width) // %1 2367 : "r"(scale), // %2 2368 "r"(interval_size), // %3 2369 "r"(interval_offset) // %4 2370 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" 2371 ); 2372} 2373 2374// Shade 8 pixels at a time by specified value. 2375// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. 2376// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. 2377void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, 2378 uint32 value) { 2379 asm volatile ( 2380 "dup v0.4s, %w3 \n" // duplicate scale value. 2381 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. 2382 "ushr v0.8h, v0.8h, #1 \n" // scale / 2. 2383 2384 // 8 pixel loop. 2385 "1: \n" 2386 MEMACCESS(0) 2387 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2388 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2389 "uxtl v4.8h, v4.8b \n" // b (0 .. 
255) 2390 "uxtl v5.8h, v5.8b \n" 2391 "uxtl v6.8h, v6.8b \n" 2392 "uxtl v7.8h, v7.8b \n" 2393 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 2394 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g 2395 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r 2396 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a 2397 "uqxtn v4.8b, v4.8h \n" 2398 "uqxtn v5.8b, v5.8h \n" 2399 "uqxtn v6.8b, v6.8h \n" 2400 "uqxtn v7.8b, v7.8h \n" 2401 MEMACCESS(1) 2402 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels 2403 "b.gt 1b \n" 2404 : "+r"(src_argb), // %0 2405 "+r"(dst_argb), // %1 2406 "+r"(width) // %2 2407 : "r"(value) // %3 2408 : "cc", "memory", "v0", "v4", "v5", "v6", "v7" 2409 ); 2410} 2411 2412// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels 2413// Similar to ARGBToYJ but stores ARGB. 2414// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; 2415void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { 2416 asm volatile ( 2417 "movi v24.8b, #15 \n" // B * 0.11400 coefficient 2418 "movi v25.8b, #75 \n" // G * 0.58700 coefficient 2419 "movi v26.8b, #38 \n" // R * 0.29900 coefficient 2420 "1: \n" 2421 MEMACCESS(0) 2422 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 2423 "subs %w2, %w2, #8 \n" // 8 processed per loop. 2424 "umull v4.8h, v0.8b, v24.8b \n" // B 2425 "umlal v4.8h, v1.8b, v25.8b \n" // G 2426 "umlal v4.8h, v2.8b, v26.8b \n" // R 2427 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B 2428 "orr v1.8b, v0.8b, v0.8b \n" // G 2429 "orr v2.8b, v0.8b, v0.8b \n" // R 2430 MEMACCESS(1) 2431 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. 2432 "b.gt 1b \n" 2433 : "+r"(src_argb), // %0 2434 "+r"(dst_argb), // %1 2435 "+r"(width) // %2 2436 : 2437 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" 2438 ); 2439} 2440 2441// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Operates in place on dst_argb: the load uses no post-increment and the
// store advances the pointer. Results saturate at 255 (uqshrn) and the alpha
// byte is written back unchanged. The loop body runs at least once and
// repeats while width stays above 0.

void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi v20.8b, #17 \n"  // BB coefficient
    "movi v21.8b, #68 \n"  // BG coefficient
    "movi v22.8b, #35 \n"  // BR coefficient
    "movi v24.8b, #22 \n"  // GB coefficient
    "movi v25.8b, #88 \n"  // GG coefficient
    "movi v26.8b, #45 \n"  // GR coefficient
    "movi v28.8b, #24 \n"  // RB coefficient
    "movi v29.8b, #98 \n"  // RG coefficient
    "movi v30.8b, #50 \n"  // RR coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
    "subs %w1, %w1, #8 \n"  // 8 processed per loop.
    "umull v4.8h, v0.8b, v20.8b \n"  // B to Sepia B
    "umlal v4.8h, v1.8b, v21.8b \n"  // G
    "umlal v4.8h, v2.8b, v22.8b \n"  // R
    "umull v5.8h, v0.8b, v24.8b \n"  // B to Sepia G
    "umlal v5.8h, v1.8b, v25.8b \n"  // G
    "umlal v5.8h, v2.8b, v26.8b \n"  // R
    "umull v6.8h, v0.8b, v28.8b \n"  // B to Sepia R
    "umlal v6.8h, v1.8b, v29.8b \n"  // G
    "umlal v6.8h, v2.8b, v30.8b \n"  // R
    "uqshrn v0.8b, v4.8h, #7 \n"  // 16 bit to 8 bit B
    "uqshrn v1.8b, v5.8h, #7 \n"  // 16 bit to 8 bit G
    "uqshrn v2.8b, v6.8h, #7 \n"  // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(dst_argb),  // %0
    "+r"(width)  // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}

// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
// matrix_argb supplies 16 signed 8-bit coefficients, read with one 16-byte
// load. Coefficients 0-3 produce output B, 4-7 output G, 8-11 output R and
// 12-15 output A; within each group the four values weight the input B, G, R
// and A channels in that order. Products are summed with saturating 16-bit
// adds, then shifted right by 6 and narrowed with unsigned saturation.
// The loop body runs at least once and repeats while width stays above 0.
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
                             const int8* matrix_argb, int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n"  // load 16 matrix coefficients (4 rows of 4).
    "sxtl v0.8h, v2.8b \n"  // B,G coefficients s16.
    "sxtl2 v1.8h, v2.16b \n"  // R,A coefficients s16.

  "1: \n"
    MEMACCESS(0)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "uxtl v16.8h, v16.8b \n"  // b (0 .. 255) 16 bit
    "uxtl v17.8h, v17.8b \n"  // g
    "uxtl v18.8h, v18.8b \n"  // r
    "uxtl v19.8h, v19.8b \n"  // a
    "mul v22.8h, v16.8h, v0.h[0] \n"  // B = B * Matrix B
    "mul v23.8h, v16.8h, v0.h[4] \n"  // G = B * Matrix G
    "mul v24.8h, v16.8h, v1.h[0] \n"  // R = B * Matrix R
    "mul v25.8h, v16.8h, v1.h[4] \n"  // A = B * Matrix A
    "mul v4.8h, v17.8h, v0.h[1] \n"  // B += G * Matrix B
    "mul v5.8h, v17.8h, v0.h[5] \n"  // G += G * Matrix G
    "mul v6.8h, v17.8h, v1.h[1] \n"  // R += G * Matrix R
    "mul v7.8h, v17.8h, v1.h[5] \n"  // A += G * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v18.8h, v0.h[2] \n"  // B += R * Matrix B
    "mul v5.8h, v18.8h, v0.h[6] \n"  // G += R * Matrix G
    "mul v6.8h, v18.8h, v1.h[2] \n"  // R += R * Matrix R
    "mul v7.8h, v18.8h, v1.h[6] \n"  // A += R * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "mul v4.8h, v19.8h, v0.h[3] \n"  // B += A * Matrix B
    "mul v5.8h, v19.8h, v0.h[7] \n"  // G += A * Matrix G
    "mul v6.8h, v19.8h, v1.h[3] \n"  // R += A * Matrix R
    "mul v7.8h, v19.8h, v1.h[7] \n"  // A += A * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
    "sqshrun v16.8b, v22.8h, #6 \n"  // 16 bit to 8 bit B
    "sqshrun v17.8b, v23.8h, #6 \n"  // 16 bit to 8 bit G
    "sqshrun v18.8b, v24.8h, #6 \n"  // 16 bit to 8 bit R
    "sqshrun v19.8b, v25.8h, #6 \n"  // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)  // %2
  : "r"(matrix_argb)  // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}

// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, becomes round(a * b / 256) (umull followed
// by the rounding narrow rshrn #8).
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "umull v0.8h, v0.8b, v4.8b \n"  // multiply B
    "umull v1.8h, v1.8b, v5.8b \n"  // multiply G
    "umull v2.8h, v2.8b, v6.8b \n"  // multiply R
    "umull v3.8h, v3.8b, v7.8b \n"  // multiply A
    "rshrn v0.8b, v0.8h, #8 \n"  // 16 bit to 8 bit B
    "rshrn v1.8b, v1.8h, #8 \n"  // 16 bit to 8 bit G
    "rshrn v2.8b, v2.8h, #8 \n"  // 16 bit to 8 bit R
    "rshrn v3.8b, v3.8h, #8 \n"  // 16 bit to 8 bit A
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, is added with unsigned saturation (uqadd).
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v4.8b \n"
    "uqadd v1.8b, v1.8b, v5.8b \n"
    "uqadd v2.8b, v2.8b, v6.8b \n"
    "uqadd v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb0 - src_argb1 per channel, alpha included, with unsigned
// saturation (uqsub), so results clamp at 0.
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqsub v0.8b, v0.8b, v4.8b \n"
    "uqsub v1.8b, v1.8b, v5.8b \n"
    "uqsub v2.8b, v2.8b, v6.8b \n"
    "uqsub v3.8b, v3.8b, v7.8b \n"
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"

  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// The addition saturates at 255 (uqadd). 8 pixels per iteration.
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v0.8b, v0.8b, v1.8b \n"  // add
    "orr v1.8b, v0.8b, v0.8b \n"  // copy the sum into G
    "orr v2.8b, v0.8b, v0.8b \n"  // copy the sum into R
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
// The addition saturates at 255 (uqadd). 16 pixels per iteration.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 16 sobelx.
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // load 16 sobely.
    "subs %w3, %w3, #16 \n"  // 16 processed per loop.
    "uqadd v0.16b, v0.16b, v1.16b \n"  // add
    MEMACCESS(2)
    "st1 {v0.16b}, [%2], #16 \n"  // store 16 pixels.
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1"
  );
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel X is loaded straight into v2 (R) and Sobel Y into v0 (B), so only
// the saturating sum for G has to be computed.
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // alpha
    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0], #8 \n"  // load 8 sobelx.
    MEMACCESS(1)
    "ld1 {v0.8b}, [%1], #8 \n"  // load 8 sobely.
    "subs %w3, %w3, #8 \n"  // 8 processed per loop.
    "uqadd v1.8b, v0.8b, v2.8b \n"  // add
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),  // %2
    "+r"(width)  // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"
  );
}

// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each of 8 output pixels i per iteration this stores
//   sat8(|(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|).
// Each source pointer is advanced by 2 and then by 6 (operands %5 and %6),
// i.e. 8 bytes per iteration; the second load of each pair reads 8 bytes
// starting 2 bytes further on, so every row is read 2 bytes beyond the last
// pixel processed.
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%5 \n"  // top
    MEMACCESS(0)
    "ld1 {v1.8b}, [%0],%6 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(1)
    "ld1 {v2.8b}, [%1],%5 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%6 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"  // added twice for the weight of 2
    MEMACCESS(2)
    "ld1 {v2.8b}, [%2],%5 \n"  // bottom
    MEMACCESS(2)
    "ld1 {v3.8b}, [%2],%6 \n"
    "subs %w4, %w4, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(3)
    "st1 {v0.8b}, [%3], #8 \n"  // store 8 sobelx
    "b.gt 1b \n"
  : "+r"(src_y0),  // %0
    "+r"(src_y1),  // %1
    "+r"(src_y2),  // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)  // %4
  : "r"(2LL),  // %5
    "r"(6LL)  // %6
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// For each of 8 output pixels i per iteration this stores
//   sat8(|(y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])|).
// Both source pointers are advanced by 1, 1 and then 6 (operands %4 and %5),
// i.e. 8 bytes per iteration; the third load reads 8 bytes starting 2 bytes
// further on, so each row is read 2 bytes beyond the last pixel processed.
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b}, [%0],%4 \n"  // left
    MEMACCESS(1)
    "ld1 {v1.8b}, [%1],%4 \n"
    "usubl v0.8h, v0.8b, v1.8b \n"
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%4 \n"  // center * 2
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%4 \n"
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "add v0.8h, v0.8h, v1.8h \n"  // added twice for the weight of 2
    MEMACCESS(0)
    "ld1 {v2.8b}, [%0],%5 \n"  // right
    MEMACCESS(1)
    "ld1 {v3.8b}, [%1],%5 \n"
    "subs %w3, %w3, #8 \n"  // 8 pixels
    "usubl v1.8h, v2.8b, v3.8b \n"
    "add v0.8h, v0.8h, v1.8h \n"
    "abs v0.8h, v0.8h \n"
    "uqxtn v0.8b, v0.8h \n"
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n"  // store 8 sobely
    "b.gt 1b \n"
  : "+r"(src_y0),  // %0
    "+r"(src_y1),  // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)  // %3
  : "r"(1LL),  // %4
    "r"(6LL)  // %5
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
